Skip to main content

synth_backend/
arm_encoder.rs

1//! ARM Code Encoder - Converts ARM instructions to binary machine code
2//!
3//! Generates ARM32/Thumb-2 machine code from ARM instruction structures
4
5use synth_core::Result;
6use synth_core::target::FPUPrecision;
7use synth_synthesis::contracts::encoding as encoding_contracts;
8use synth_synthesis::{ArmOp, MemAddr, MveSize, Operand2, QReg, Reg, VfpReg};
9
10/// ARM instruction encoding
11pub struct ArmEncoder {
12    /// Use Thumb mode (vs ARM mode)
13    thumb_mode: bool,
14    /// FPU capability for VFP instruction encoding
15    #[allow(dead_code)]
16    fpu: Option<FPUPrecision>,
17}
18
19impl ArmEncoder {
20    /// Create a new ARM encoder in ARM32 mode
21    pub fn new_arm32() -> Self {
22        Self {
23            thumb_mode: false,
24            fpu: None,
25        }
26    }
27
28    /// Create a new ARM encoder in Thumb-2 mode
29    pub fn new_thumb2() -> Self {
30        Self {
31            thumb_mode: true,
32            fpu: None,
33        }
34    }
35
36    /// Create a new Thumb-2 encoder with FPU capability
37    pub fn new_thumb2_with_fpu(fpu: Option<FPUPrecision>) -> Self {
38        Self {
39            thumb_mode: true,
40            fpu,
41        }
42    }
43
44    /// Encode a single ARM instruction to bytes
45    pub fn encode(&self, op: &ArmOp) -> Result<Vec<u8>> {
46        if self.thumb_mode {
47            self.encode_thumb(op)
48        } else {
49            self.encode_arm(op)
50        }
51    }
52
53    /// Encode an ARM instruction in ARM32 mode (32-bit instructions)
54    fn encode_arm(&self, op: &ArmOp) -> Result<Vec<u8>> {
55        let instr: u32 = match op {
56            // Data processing instructions
57            ArmOp::Add { rd, rn, op2 } => {
58                let rd_bits = reg_to_bits(rd);
59                let rn_bits = reg_to_bits(rn);
60                let (op2_bits, i_flag) = encode_operand2(op2);
61
62                // ADD encoding: cond(4) | 00 | I(1) | 0100 | S(1) | Rn(4) | Rd(4) | operand2(12)
63                0xE0800000 // condition=always(E), opcode=ADD(0100), S=0
64                    | (i_flag << 25)
65                    | (rn_bits << 16)
66                    | (rd_bits << 12)
67                    | op2_bits
68            }
69
70            ArmOp::Sub { rd, rn, op2 } => {
71                let rd_bits = reg_to_bits(rd);
72                let rn_bits = reg_to_bits(rn);
73                let (op2_bits, i_flag) = encode_operand2(op2);
74
75                // SUB encoding: opcode=0010
76                0xE0400000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
77            }
78
79            // i64 support: ADDS, ADC, SUBS, SBC for ARM32
80            ArmOp::Adds { rd, rn, op2 } => {
81                let rd_bits = reg_to_bits(rd);
82                let rn_bits = reg_to_bits(rn);
83                let (op2_bits, i_flag) = encode_operand2(op2);
84
85                // ADDS encoding: opcode=0100, S=1
86                0xE0900000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
87            }
88
89            ArmOp::Adc { rd, rn, op2 } => {
90                let rd_bits = reg_to_bits(rd);
91                let rn_bits = reg_to_bits(rn);
92                let (op2_bits, i_flag) = encode_operand2(op2);
93
94                // ADC encoding: opcode=0101
95                0xE0A00000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
96            }
97
98            ArmOp::Subs { rd, rn, op2 } => {
99                let rd_bits = reg_to_bits(rd);
100                let rn_bits = reg_to_bits(rn);
101                let (op2_bits, i_flag) = encode_operand2(op2);
102
103                // SUBS encoding: opcode=0010, S=1
104                0xE0500000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
105            }
106
107            ArmOp::Sbc { rd, rn, op2 } => {
108                let rd_bits = reg_to_bits(rd);
109                let rn_bits = reg_to_bits(rn);
110                let (op2_bits, i_flag) = encode_operand2(op2);
111
112                // SBC encoding: opcode=0110
113                0xE0C00000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
114            }
115
116            ArmOp::Mul { rd, rn, rm } => {
117                let rd_bits = reg_to_bits(rd);
118                let rn_bits = reg_to_bits(rn);
119                let rm_bits = reg_to_bits(rm);
120
121                // MUL encoding: cond(4) | 000000 | A(1) | S(1) | Rd(4) | Rn(4) | Rs(4) | 1001 | Rm(4)
122                0xE0000090 | (rd_bits << 16) | (rn_bits << 8) | rm_bits
123            }
124
125            ArmOp::Sdiv { rd, rn, rm } => {
126                let rd_bits = reg_to_bits(rd);
127                let rn_bits = reg_to_bits(rn);
128                let rm_bits = reg_to_bits(rm);
129
130                // SDIV encoding: cond(4) | 01110001 | Rd(4) | 1111 | Rm(4) | 0001 | Rn(4)
131                // ARMv7-M and above
132                0xE710F010 | (rd_bits << 16) | (rm_bits << 8) | rn_bits
133            }
134
135            ArmOp::Udiv { rd, rn, rm } => {
136                let rd_bits = reg_to_bits(rd);
137                let rn_bits = reg_to_bits(rn);
138                let rm_bits = reg_to_bits(rm);
139
140                // UDIV encoding: cond(4) | 01110011 | Rd(4) | 1111 | Rm(4) | 0001 | Rn(4)
141                // ARMv7-M and above
142                0xE730F010 | (rd_bits << 16) | (rm_bits << 8) | rn_bits
143            }
144
145            ArmOp::Mls { rd, rn, rm, ra } => {
146                let rd_bits = reg_to_bits(rd);
147                let rn_bits = reg_to_bits(rn);
148                let rm_bits = reg_to_bits(rm);
149                let ra_bits = reg_to_bits(ra);
150
151                // MLS encoding: cond(4) | 00000110 | Rd(4) | Ra(4) | Rm(4) | 1001 | Rn(4)
152                // Rd = Ra - (Rn * Rm)
153                0xE0600090 | (rd_bits << 16) | (ra_bits << 12) | (rm_bits << 8) | rn_bits
154            }
155
156            ArmOp::And { rd, rn, op2 } => {
157                let rd_bits = reg_to_bits(rd);
158                let rn_bits = reg_to_bits(rn);
159                let (op2_bits, i_flag) = encode_operand2(op2);
160
161                // AND encoding: opcode=0000
162                0xE0000000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
163            }
164
165            ArmOp::Orr { rd, rn, op2 } => {
166                let rd_bits = reg_to_bits(rd);
167                let rn_bits = reg_to_bits(rn);
168                let (op2_bits, i_flag) = encode_operand2(op2);
169
170                // ORR encoding: opcode=1100
171                0xE1800000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
172            }
173
174            ArmOp::Eor { rd, rn, op2 } => {
175                let rd_bits = reg_to_bits(rd);
176                let rn_bits = reg_to_bits(rn);
177                let (op2_bits, i_flag) = encode_operand2(op2);
178
179                // EOR encoding: opcode=0001
180                0xE0200000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
181            }
182
183            // Shift instructions
184            ArmOp::Lsl { rd, rn, shift } => {
185                let rd_bits = reg_to_bits(rd);
186                let rn_bits = reg_to_bits(rn);
187                let shift_bits = *shift & 0x1F;
188
189                // LSL encoding: MOV with shift
190                0xE1A00000 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
191            }
192
193            ArmOp::Lsr { rd, rn, shift } => {
194                let rd_bits = reg_to_bits(rd);
195                let rn_bits = reg_to_bits(rn);
196                let shift_bits = *shift & 0x1F;
197
198                // LSR encoding
199                0xE1A00020 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
200            }
201
202            ArmOp::Asr { rd, rn, shift } => {
203                let rd_bits = reg_to_bits(rd);
204                let rn_bits = reg_to_bits(rn);
205                let shift_bits = *shift & 0x1F;
206
207                // ASR encoding
208                0xE1A00040 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
209            }
210
211            ArmOp::Ror { rd, rn, shift } => {
212                let rd_bits = reg_to_bits(rd);
213                let rn_bits = reg_to_bits(rn);
214                let shift_bits = *shift & 0x1F;
215
216                // ROR encoding: MOV with ROR shift
217                0xE1A00060 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
218            }
219
220            // Register-based shifts (ARM32)
221            // LSL Rd, Rn, Rm: cond 0001101S 0000 Rd Rs 0001 Rn
222            ArmOp::LslReg { rd, rn, rm } => {
223                let rd_bits = reg_to_bits(rd);
224                let rn_bits = reg_to_bits(rn);
225                let rm_bits = reg_to_bits(rm);
226                0xE1A00010 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
227            }
228            ArmOp::LsrReg { rd, rn, rm } => {
229                let rd_bits = reg_to_bits(rd);
230                let rn_bits = reg_to_bits(rn);
231                let rm_bits = reg_to_bits(rm);
232                0xE1A00030 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
233            }
234            ArmOp::AsrReg { rd, rn, rm } => {
235                let rd_bits = reg_to_bits(rd);
236                let rn_bits = reg_to_bits(rn);
237                let rm_bits = reg_to_bits(rm);
238                0xE1A00050 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
239            }
240            ArmOp::RorReg { rd, rn, rm } => {
241                let rd_bits = reg_to_bits(rd);
242                let rn_bits = reg_to_bits(rn);
243                let rm_bits = reg_to_bits(rm);
244                0xE1A00070 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
245            }
246
247            // RSB (Reverse Subtract): Rd = imm - Rn
248            ArmOp::Rsb { rd, rn, imm } => {
249                let rd_bits = reg_to_bits(rd);
250                let rn_bits = reg_to_bits(rn);
251                // RSB encoding: cond(4) | 00 1 0011 S | Rn(4) | Rd(4) | imm12
252                // Opcode for RSB = 0011, I=1 (immediate), S=0
253                0xE2600000 | (rn_bits << 16) | (rd_bits << 12) | (*imm & 0xFF)
254            }
255
256            // Bit manipulation instructions
257            ArmOp::Clz { rd, rm } => {
258                let rd_bits = reg_to_bits(rd);
259                let rm_bits = reg_to_bits(rm);
260
261                // CLZ encoding: cond(4) | 00010110 | 1111 | Rd(4) | 1111 | 0001 | Rm(4)
262                // ARMv5T and above
263                0xE16F0F10 | (rd_bits << 12) | rm_bits
264            }
265
266            ArmOp::Rbit { rd, rm } => {
267                let rd_bits = reg_to_bits(rd);
268                let rm_bits = reg_to_bits(rm);
269
270                // RBIT encoding: cond(4) | 01101111 | 1111 | Rd(4) | 1111 | 0011 | Rm(4)
271                // ARMv6T2 and above
272                0xE6FF0F30 | (rd_bits << 12) | rm_bits
273            }
274
275            ArmOp::Sxtb { rd, rm } => {
276                let rd_bits = reg_to_bits(rd);
277                let rm_bits = reg_to_bits(rm);
278
279                // SXTB encoding: cond(4) | 01101010 | 1111 | Rd(4) | rotate(2) | 00 | 0111 | Rm(4)
280                // ARMv6 and above. rotate=00 for no rotation
281                0xE6AF0070 | (rd_bits << 12) | rm_bits
282            }
283
284            ArmOp::Sxth { rd, rm } => {
285                let rd_bits = reg_to_bits(rd);
286                let rm_bits = reg_to_bits(rm);
287
288                // SXTH encoding: cond(4) | 01101011 | 1111 | Rd(4) | rotate(2) | 00 | 0111 | Rm(4)
289                // ARMv6 and above. rotate=00 for no rotation
290                0xE6BF0070 | (rd_bits << 12) | rm_bits
291            }
292
293            // Move instructions
294            ArmOp::Mov { rd, op2 } => {
295                let rd_bits = reg_to_bits(rd);
296                let (op2_bits, i_flag) = encode_operand2(op2);
297
298                // MOV encoding: opcode=1101
299                0xE1A00000 | (i_flag << 25) | (rd_bits << 12) | op2_bits
300            }
301
302            ArmOp::Mvn { rd, op2 } => {
303                let rd_bits = reg_to_bits(rd);
304                let (op2_bits, i_flag) = encode_operand2(op2);
305
306                // MVN encoding: opcode=1111
307                0xE1E00000 | (i_flag << 25) | (rd_bits << 12) | op2_bits
308            }
309
310            // MOVW - Move Wide (ARM32)
311            // Encoding: cond(4) | 0011 0000 | imm4(4) | Rd(4) | imm12(12)
312            ArmOp::Movw { rd, imm16 } => {
313                let rd_bits = reg_to_bits(rd);
314                let imm4 = ((*imm16 as u32) >> 12) & 0xF;
315                let imm12 = (*imm16 as u32) & 0xFFF;
316                0xE3000000 | (imm4 << 16) | (rd_bits << 12) | imm12
317            }
318
319            // MOVT - Move Top (ARM32)
320            // Encoding: cond(4) | 0011 0100 | imm4(4) | Rd(4) | imm12(12)
321            ArmOp::Movt { rd, imm16 } => {
322                let rd_bits = reg_to_bits(rd);
323                let imm4 = ((*imm16 as u32) >> 12) & 0xF;
324                let imm12 = (*imm16 as u32) & 0xFFF;
325                0xE3400000 | (imm4 << 16) | (rd_bits << 12) | imm12
326            }
327
328            // Compare
329            ArmOp::Cmp { rn, op2 } => {
330                let rn_bits = reg_to_bits(rn);
331                let (op2_bits, i_flag) = encode_operand2(op2);
332
333                // CMP encoding: opcode=1010, S=1
334                0xE1500000 | (i_flag << 25) | (rn_bits << 16) | op2_bits
335            }
336
337            // Compare Negative (CMN) - computes Rn + op2 and sets flags
338            ArmOp::Cmn { rn, op2 } => {
339                let rn_bits = reg_to_bits(rn);
340                let (op2_bits, i_flag) = encode_operand2(op2);
341
342                // CMN encoding: opcode=1011, S=1
343                0xE1700000 | (i_flag << 25) | (rn_bits << 16) | op2_bits
344            }
345
346            // Load/Store
347            ArmOp::Ldr { rd, addr } => {
348                let rd_bits = reg_to_bits(rd);
349                let (base_bits, offset_bits) = encode_mem_addr(addr);
350
351                // LDR encoding: cond(4) | 01 | I(1) | P(1) | U(1) | B(1) | W(1) | L(1) | Rn(4) | Rd(4) | offset(12)
352                // P=1 (pre-indexed), U=1 (add offset), L=1 (load)
353                0xE5900000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
354            }
355
356            ArmOp::Str { rd, addr } => {
357                let rd_bits = reg_to_bits(rd);
358                let (base_bits, offset_bits) = encode_mem_addr(addr);
359
360                // STR encoding: L=0 (store)
361                0xE5800000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
362            }
363
364            // Sub-word loads (ARM32 encoding)
365            ArmOp::Ldrb { rd, addr } => {
366                let rd_bits = reg_to_bits(rd);
367                let (base_bits, offset_bits) = encode_mem_addr(addr);
368                // LDRB: LDR with B=1 (byte): cond|01|I|P|U|1|W|L|Rn|Rd|offset
369                0xE5D00000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
370            }
371
372            ArmOp::Ldrsb { rd, addr } => {
373                let rd_bits = reg_to_bits(rd);
374                let (base_bits, offset_bits) = encode_mem_addr(addr);
375                // LDRSB (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1101|imm4L
376                // Simplified with immediate offset
377                let offset_val = offset_bits & 0xFF;
378                let imm4h = (offset_val >> 4) & 0xF;
379                let imm4l = offset_val & 0xF;
380                0xE1D000D0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
381            }
382
383            ArmOp::Ldrh { rd, addr } => {
384                let rd_bits = reg_to_bits(rd);
385                let (base_bits, offset_bits) = encode_mem_addr(addr);
386                // LDRH (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1011|imm4L
387                let offset_val = offset_bits & 0xFF;
388                let imm4h = (offset_val >> 4) & 0xF;
389                let imm4l = offset_val & 0xF;
390                0xE1D000B0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
391            }
392
393            ArmOp::Ldrsh { rd, addr } => {
394                let rd_bits = reg_to_bits(rd);
395                let (base_bits, offset_bits) = encode_mem_addr(addr);
396                // LDRSH (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1111|imm4L
397                let offset_val = offset_bits & 0xFF;
398                let imm4h = (offset_val >> 4) & 0xF;
399                let imm4l = offset_val & 0xF;
400                0xE1D000F0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
401            }
402
403            // Sub-word stores (ARM32 encoding)
404            ArmOp::Strb { rd, addr } => {
405                let rd_bits = reg_to_bits(rd);
406                let (base_bits, offset_bits) = encode_mem_addr(addr);
407                // STRB: STR with B=1 (byte): cond|01|I|P|U|1|W|0|Rn|Rd|offset
408                0xE5C00000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
409            }
410
411            ArmOp::Strh { rd, addr } => {
412                let rd_bits = reg_to_bits(rd);
413                let (base_bits, offset_bits) = encode_mem_addr(addr);
414                // STRH (misc store): cond|000|P|U|1|W|0|Rn|Rd|imm4H|1011|imm4L
415                let offset_val = offset_bits & 0xFF;
416                let imm4h = (offset_val >> 4) & 0xF;
417                let imm4l = offset_val & 0xF;
418                0xE1C000B0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
419            }
420
421            // Memory management (ARM32 encoding)
422            ArmOp::MemorySize { rd } => {
423                let rd_bits = reg_to_bits(rd);
424                // MOV rd, R10, LSR #16  (memory size in bytes / 65536 = pages)
425                // cond|000|1101|S|0000|Rd|shift5|type|0|Rm
426                // LSR #16: shift5=10000, type=01
427                0xE1A00820 | (rd_bits << 12) | 0x0A // Rm=R10, shift=16, LSR
428            }
429
430            ArmOp::MemoryGrow { rd, .. } => {
431                let rd_bits = reg_to_bits(rd);
432                // On embedded, always fail: MOV rd, #-1
433                0xE3E00000 | (rd_bits << 12) // MVN rd, #0 = MOV rd, #-1
434            }
435
436            // Label pseudo-instruction: emits no machine code
437            ArmOp::Label { .. } => {
438                return Ok(Vec::new());
439            }
440
441            // Branch instructions
442            ArmOp::B { label: _ } => {
443                // B encoding: cond(4) | 1010 | offset(24)
444                // Simplified: branch to offset 0 (will be patched by linker/resolver)
445                0xEA000000
446            }
447
448            // Conditional branch to label (generic)
449            ArmOp::Bcc { cond, label: _ } => {
450                use synth_synthesis::Condition;
451                let cond_bits: u32 = match cond {
452                    Condition::EQ => 0x0,
453                    Condition::NE => 0x1,
454                    Condition::HS => 0x2,
455                    Condition::LO => 0x3,
456                    Condition::HI => 0x8,
457                    Condition::LS => 0x9,
458                    Condition::GE => 0xA,
459                    Condition::LT => 0xB,
460                    Condition::GT => 0xC,
461                    Condition::LE => 0xD,
462                };
463                // B<cond> with offset 0 (will be patched)
464                (cond_bits << 28) | 0x0A000000
465            }
466
467            // BHS (Branch if Higher or Same) - used for bounds checking
468            ArmOp::Bhs { label: _ } => {
469                // BHS encoding: cond(2=HS) | 1010 | offset(24)
470                0x2A000000 // BHS with offset 0
471            }
472
473            // BLO (Branch if Lower) - complementary to BHS
474            ArmOp::Blo { label: _ } => {
475                // BLO encoding: cond(3=LO) | 1010 | offset(24)
476                0x3A000000 // BLO with offset 0
477            }
478
479            // Branch with numeric offset (in instructions)
480            // ARM32 B instruction: offset is in instructions, stored as words
481            // The offset is relative to PC+8 (due to ARM pipeline)
482            ArmOp::BOffset { offset } => {
483                // B encoding: cond(4) | 1010 | offset(24)
484                // Offset is signed, in words (4-byte units)
485                // ARM adds PC+8 to the offset, so we need to adjust:
486                // target = PC + 8 + (offset * 4)
487                // For backward branch of N instructions: offset = -(N + 2)
488                // wrapping_sub keeps the encoder total under fuzzing (#186): an
489                // extreme i32::MIN offset would otherwise overflow-panic; for any
490                // real branch offset this is identical to `- 2`.
491                let adjusted_offset = offset.wrapping_sub(2); // Account for PC+8
492                let offset_bits = (adjusted_offset as u32) & 0x00FFFFFF;
493                0xEA000000 | offset_bits
494            }
495
496            // Conditional branch with numeric offset
497            ArmOp::BCondOffset { cond, offset } => {
498                use synth_synthesis::Condition;
499                let cond_bits: u32 = match cond {
500                    Condition::EQ => 0x0,
501                    Condition::NE => 0x1,
502                    Condition::HS => 0x2,
503                    Condition::LO => 0x3,
504                    Condition::HI => 0x8,
505                    Condition::LS => 0x9,
506                    Condition::GE => 0xA,
507                    Condition::LT => 0xB,
508                    Condition::GT => 0xC,
509                    Condition::LE => 0xD,
510                };
511                // B<cond> encoding: cond(4) | 1010 | offset(24)
512                // wrapping_sub: total under fuzzing (#186), identical for real offsets.
513                let adjusted_offset = offset.wrapping_sub(2); // Account for PC+8
514                let offset_bits = (adjusted_offset as u32) & 0x00FFFFFF;
515                (cond_bits << 28) | 0x0A000000 | offset_bits
516            }
517
518            ArmOp::Bl { label: _ } => {
519                // BL encoding: cond(4) | 1011 | offset(24)
520                0xEB000000
521            }
522
523            ArmOp::Bx { rm } => {
524                let rm_bits = reg_to_bits(rm);
525
526                // BX encoding: cond(4) | 000100101111111111110001 | Rm(4)
527                0xE12FFF10 | rm_bits
528            }
529
530            ArmOp::Blx { rm } => {
531                let rm_bits = reg_to_bits(rm);
532
533                // BLX (register) encoding: cond(4) | 000100101111111111110011 | Rm(4)
534                0xE12FFF30 | rm_bits
535            }
536
537            ArmOp::Push { regs } => {
538                // STMDB SP!, {regs} encoding: cond(4) | 100100 | 10 | 1101 | register_list(16)
539                let mut reg_list: u32 = 0;
540                for r in regs {
541                    reg_list |= 1 << reg_to_bits(r);
542                }
543                0xE92D0000 | reg_list
544            }
545
546            ArmOp::Pop { regs } => {
547                // LDMIA SP!, {regs} encoding: cond(4) | 100010 | 11 | 1101 | register_list(16)
548                let mut reg_list: u32 = 0;
549                for r in regs {
550                    reg_list |= 1 << reg_to_bits(r);
551                }
552                0xE8BD0000 | reg_list
553            }
554
555            ArmOp::Nop => {
556                // NOP encoding: MOV R0, R0
557                0xE1A00000
558            }
559
560            ArmOp::Udf { imm } => {
561                // UDF (Undefined) encoding in ARM: 0xE7F000F0 | (imm12_hi << 8) | imm4_lo
562                // We only use imm8, so split into imm4_hi and imm4_lo
563                let imm8 = *imm as u32;
564                0xE7F000F0 | ((imm8 & 0xF0) << 4) | (imm8 & 0x0F)
565            }
566
567            // Pseudo-instructions for verification - encode as NOP
568            // These are used in formal verification but not actual code generation
569            ArmOp::Popcnt { .. } => {
570                // Population count pseudo-instruction
571                // Not a real ARM instruction, would be expanded to actual code
572                0xE1A00000 // NOP for now
573            }
574
575            ArmOp::SetCond { .. } => {
576                // Condition evaluation pseudo-instruction
577                // Not a real ARM instruction, would be expanded to actual code
578                0xE1A00000 // NOP for now
579            }
580
581            ArmOp::SelectMove { .. } => {
582                // Conditional move pseudo-instruction for ARM32
583                // Would use MOV{cond} instruction
584                0xE1A00000 // NOP for now
585            }
586
587            ArmOp::Select { .. } => {
588                // Select pseudo-instruction
589                // Not a real ARM instruction, would be expanded to conditional moves
590                0xE1A00000 // NOP for now
591            }
592
593            ArmOp::LocalGet { .. } => {
594                // Local variable get pseudo-instruction
595                // Not a real ARM instruction, would be expanded to memory access
596                0xE1A00000 // NOP for now
597            }
598
599            ArmOp::LocalSet { .. } => {
600                // Local variable set pseudo-instruction
601                // Not a real ARM instruction, would be expanded to memory access
602                0xE1A00000 // NOP for now
603            }
604
605            ArmOp::LocalTee { .. } => {
606                // Local variable tee pseudo-instruction
607                // Not a real ARM instruction, would be expanded to memory access
608                0xE1A00000 // NOP for now
609            }
610
611            ArmOp::GlobalGet { .. } => {
612                // Global variable get pseudo-instruction
613                // Not a real ARM instruction, would be expanded to memory access
614                0xE1A00000 // NOP for now
615            }
616
617            ArmOp::GlobalSet { .. } => {
618                // Global variable set pseudo-instruction
619                // Not a real ARM instruction, would be expanded to memory access
620                0xE1A00000 // NOP for now
621            }
622
623            ArmOp::BrTable { .. } => {
624                // Branch table pseudo-instruction
625                // Not a real ARM instruction, would be expanded to jump table
626                0xE1A00000 // NOP for now
627            }
628
629            ArmOp::Call { .. } => {
630                // Function call pseudo-instruction
631                // Not a real ARM instruction, would be expanded to BL
632                0xE1A00000 // NOP for now
633            }
634
635            ArmOp::CallIndirect { .. } => {
636                // Indirect function call pseudo-instruction
637                // Not a real ARM instruction, would be expanded to indirect branch
638                0xE1A00000 // NOP for now
639            }
640
641            // i64 pseudo-instructions (Phase 2) - encode as NOP for now
642            // Real compiler would expand these to multi-instruction sequences
643            ArmOp::I64Add { .. } => 0xE1A00000,        // NOP
644            ArmOp::I64Sub { .. } => 0xE1A00000,        // NOP
645            ArmOp::I64DivS { .. } => 0xE1A00000,       // NOP
646            ArmOp::I64DivU { .. } => 0xE1A00000,       // NOP
647            ArmOp::I64RemS { .. } => 0xE1A00000,       // NOP
648            ArmOp::I64RemU { .. } => 0xE1A00000,       // NOP
649            ArmOp::I64Clz { .. } => 0xE1A00000,        // NOP
650            ArmOp::I64Ctz { .. } => 0xE1A00000,        // NOP
651            ArmOp::I64Popcnt { .. } => 0xE1A00000,     // NOP
652            ArmOp::I64And { .. } => 0xE1A00000,        // NOP
653            ArmOp::I64Or { .. } => 0xE1A00000,         // NOP
654            ArmOp::I64Xor { .. } => 0xE1A00000,        // NOP
655            ArmOp::I64Eqz { .. } => 0xE1A00000,        // NOP
656            ArmOp::I64Eq { .. } => 0xE1A00000,         // NOP
657            ArmOp::I64Ne { .. } => 0xE1A00000,         // NOP
658            ArmOp::I64LtS { .. } => 0xE1A00000,        // NOP
659            ArmOp::I64LtU { .. } => 0xE1A00000,        // NOP
660            ArmOp::I64LeS { .. } => 0xE1A00000,        // NOP
661            ArmOp::I64LeU { .. } => 0xE1A00000,        // NOP
662            ArmOp::I64GtS { .. } => 0xE1A00000,        // NOP
663            ArmOp::I64GtU { .. } => 0xE1A00000,        // NOP
664            ArmOp::I64GeS { .. } => 0xE1A00000,        // NOP
665            ArmOp::I64GeU { .. } => 0xE1A00000,        // NOP
666            ArmOp::I64Const { .. } => 0xE1A00000,      // NOP
667            ArmOp::I64Ldr { .. } => 0xE1A00000,        // NOP
668            ArmOp::I64Str { .. } => 0xE1A00000,        // NOP
669            ArmOp::I64ExtendI32S { .. } => 0xE1A00000, // NOP
670            ArmOp::I64ExtendI32U { .. } => 0xE1A00000, // NOP
671            ArmOp::I64Extend8S { .. } => 0xE1A00000,   // NOP (Thumb-2 only)
672            ArmOp::I64Extend16S { .. } => 0xE1A00000,  // NOP (Thumb-2 only)
673            ArmOp::I64Extend32S { .. } => 0xE1A00000,  // NOP (Thumb-2 only)
674            ArmOp::I32WrapI64 { .. } => 0xE1A00000,    // NOP
675
676            // f32 VFP single-precision instructions
677            ArmOp::F32Add { sd, sn, sm } => encode_vfp_3reg(0xEE300A00, sd, sn, sm)?,
678            ArmOp::F32Sub { sd, sn, sm } => encode_vfp_3reg(0xEE300A40, sd, sn, sm)?,
679            ArmOp::F32Mul { sd, sn, sm } => encode_vfp_3reg(0xEE200A00, sd, sn, sm)?,
680            ArmOp::F32Div { sd, sn, sm } => encode_vfp_3reg(0xEE800A00, sd, sn, sm)?,
681            ArmOp::F32Abs { sd, sm } => encode_vfp_2reg(0xEEB00AC0, sd, sm)?,
682            ArmOp::F32Neg { sd, sm } => encode_vfp_2reg(0xEEB10A40, sd, sm)?,
683            ArmOp::F32Sqrt { sd, sm } => encode_vfp_2reg(0xEEB10AC0, sd, sm)?,
684
685            // f32 pseudo-ops — multi-instruction sequences
686            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
687            ArmOp::F32Ceil { sd, sm } => {
688                return self.encode_arm_f32_rounding(sd, sm, 0b01); // Round toward +Inf
689            }
690            ArmOp::F32Floor { sd, sm } => {
691                return self.encode_arm_f32_rounding(sd, sm, 0b10); // Round toward -Inf
692            }
693            ArmOp::F32Trunc { sd, sm } => {
694                return self.encode_arm_f32_rounding(sd, sm, 0b11); // VCVT toward zero
695            }
696            ArmOp::F32Nearest { sd, sm } => {
697                return self.encode_arm_f32_rounding(sd, sm, 0b00); // VCVT to nearest
698            }
699            ArmOp::F32Min { sd, sn, sm } => {
700                return self.encode_arm_f32_minmax(sd, sn, sm, true);
701            }
702            ArmOp::F32Max { sd, sn, sm } => {
703                return self.encode_arm_f32_minmax(sd, sn, sm, false);
704            }
705            ArmOp::F32Copysign { sd, sn, sm } => {
706                return self.encode_arm_f32_copysign(sd, sn, sm);
707            }
708
709            // f32 comparisons — multi-instruction: VCMP + VMRS + conditional MOV
710            ArmOp::F32Eq { rd, sn, sm } => {
711                return self.encode_arm_f32_compare(rd, sn, sm, 0x0); // EQ
712            }
713            ArmOp::F32Ne { rd, sn, sm } => {
714                return self.encode_arm_f32_compare(rd, sn, sm, 0x1); // NE
715            }
716            ArmOp::F32Lt { rd, sn, sm } => {
717                return self.encode_arm_f32_compare(rd, sn, sm, 0x4); // MI (less than)
718            }
719            ArmOp::F32Le { rd, sn, sm } => {
720                return self.encode_arm_f32_compare(rd, sn, sm, 0x9); // LS (less or same)
721            }
722            ArmOp::F32Gt { rd, sn, sm } => {
723                return self.encode_arm_f32_compare(rd, sn, sm, 0xC); // GT
724            }
725            ArmOp::F32Ge { rd, sn, sm } => {
726                return self.encode_arm_f32_compare(rd, sn, sm, 0xA); // GE
727            }
728
729            // f32 const — multi-instruction: MOVW + MOVT + VMOV
730            ArmOp::F32Const { sd, value } => {
731                return self.encode_arm_f32_const(sd, *value);
732            }
733
734            ArmOp::F32Load { sd, addr } => encode_vfp_ldst(0xED900A00, sd, addr)?,
735            ArmOp::F32Store { sd, addr } => encode_vfp_ldst(0xED800A00, sd, addr)?,
736
737            // f32 conversions — multi-instruction sequences
738            ArmOp::F32ConvertI32S { sd, rm } => {
739                return self.encode_arm_f32_convert_i32(sd, rm, true);
740            }
741            ArmOp::F32ConvertI32U { sd, rm } => {
742                return self.encode_arm_f32_convert_i32(sd, rm, false);
743            }
744            ArmOp::F32ConvertI64S { .. } | ArmOp::F32ConvertI64U { .. } => {
745                return Err(synth_core::Error::synthesis(
746                    "F32 i64 conversion not supported (requires register pairs on 32-bit ARM)",
747                ));
748            }
749            ArmOp::F32ReinterpretI32 { sd, rm } => encode_vmov_core_sreg(true, sd, rm)?,
750            ArmOp::I32ReinterpretF32 { rd, sm } => encode_vmov_core_sreg(false, sm, rd)?,
751            ArmOp::I32TruncF32S { rd, sm } => {
752                return self.encode_arm_i32_trunc_f32(rd, sm, true);
753            }
754            ArmOp::I32TruncF32U { rd, sm } => {
755                return self.encode_arm_i32_trunc_f32(rd, sm, false);
756            }
757
758            // f64 VFP double-precision instructions (ARM32)
759            // F64 arithmetic: same as F32 but with sz=1 (bit 8 = 1, cp11 = 0xB)
760            ArmOp::F64Add { dd, dn, dm } => encode_vfp_3reg_f64(0xEE300B00, dd, dn, dm)?,
761            ArmOp::F64Sub { dd, dn, dm } => encode_vfp_3reg_f64(0xEE300B40, dd, dn, dm)?,
762            ArmOp::F64Mul { dd, dn, dm } => encode_vfp_3reg_f64(0xEE200B00, dd, dn, dm)?,
763            ArmOp::F64Div { dd, dn, dm } => encode_vfp_3reg_f64(0xEE800B00, dd, dn, dm)?,
764            ArmOp::F64Abs { dd, dm } => encode_vfp_2reg_f64(0xEEB00BC0, dd, dm)?,
765            ArmOp::F64Neg { dd, dm } => encode_vfp_2reg_f64(0xEEB10B40, dd, dm)?,
766            ArmOp::F64Sqrt { dd, dm } => encode_vfp_2reg_f64(0xEEB10BC0, dd, dm)?,
767
768            // f64 pseudo-ops
769            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
770            ArmOp::F64Ceil { dd, dm } => {
771                return self.encode_arm_f64_rounding(dd, dm, 0b01);
772            }
773            ArmOp::F64Floor { dd, dm } => {
774                return self.encode_arm_f64_rounding(dd, dm, 0b10);
775            }
776            ArmOp::F64Trunc { dd, dm } => {
777                return self.encode_arm_f64_rounding(dd, dm, 0b11);
778            }
779            ArmOp::F64Nearest { dd, dm } => {
780                return self.encode_arm_f64_rounding(dd, dm, 0b00);
781            }
782            ArmOp::F64Min { dd, dn, dm } => {
783                return self.encode_arm_f64_minmax(dd, dn, dm, true);
784            }
785            ArmOp::F64Max { dd, dn, dm } => {
786                return self.encode_arm_f64_minmax(dd, dn, dm, false);
787            }
788            ArmOp::F64Copysign { dd, dn, dm } => {
789                return self.encode_arm_f64_copysign(dd, dn, dm);
790            }
791
792            // f64 comparisons
793            ArmOp::F64Eq { rd, dn, dm } => {
794                return self.encode_arm_f64_compare(rd, dn, dm, 0x0);
795            }
796            ArmOp::F64Ne { rd, dn, dm } => {
797                return self.encode_arm_f64_compare(rd, dn, dm, 0x1);
798            }
799            ArmOp::F64Lt { rd, dn, dm } => {
800                return self.encode_arm_f64_compare(rd, dn, dm, 0x4);
801            }
802            ArmOp::F64Le { rd, dn, dm } => {
803                return self.encode_arm_f64_compare(rd, dn, dm, 0x9);
804            }
805            ArmOp::F64Gt { rd, dn, dm } => {
806                return self.encode_arm_f64_compare(rd, dn, dm, 0xC);
807            }
808            ArmOp::F64Ge { rd, dn, dm } => {
809                return self.encode_arm_f64_compare(rd, dn, dm, 0xA);
810            }
811
812            ArmOp::F64Const { dd, value } => {
813                return self.encode_arm_f64_const(dd, *value);
814            }
815
816            ArmOp::F64Load { dd, addr } => encode_vfp_ldst_f64(0xED900B00, dd, addr)?,
817            ArmOp::F64Store { dd, addr } => encode_vfp_ldst_f64(0xED800B00, dd, addr)?,
818
819            ArmOp::F64ConvertI32S { dd, rm } => {
820                return self.encode_arm_f64_convert_i32(dd, rm, true);
821            }
822            ArmOp::F64ConvertI32U { dd, rm } => {
823                return self.encode_arm_f64_convert_i32(dd, rm, false);
824            }
825            ArmOp::F64ConvertI64S { .. } | ArmOp::F64ConvertI64U { .. } => {
826                return Err(synth_core::Error::synthesis(
827                    "F64 i64 conversion not supported (requires register pairs on 32-bit ARM)",
828                ));
829            }
830            ArmOp::F64PromoteF32 { dd, sm } => {
831                return self.encode_arm_f64_promote_f32(dd, sm);
832            }
833            ArmOp::F64ReinterpretI64 { dd, rmlo, rmhi } => {
834                encode_vmov_core_dreg(true, dd, rmlo, rmhi)?
835            }
836            ArmOp::I64ReinterpretF64 { rdlo, rdhi, dm } => {
837                encode_vmov_core_dreg(false, dm, rdlo, rdhi)?
838            }
839            ArmOp::I64TruncF64S { .. } | ArmOp::I64TruncF64U { .. } => {
840                return Err(synth_core::Error::synthesis(
841                    "i64 truncation from F64 not supported (requires i64 register pairs on 32-bit ARM)",
842                ));
843            }
844            ArmOp::I32TruncF64S { rd, dm } => {
845                return self.encode_arm_i32_trunc_f64(rd, dm, true);
846            }
847            ArmOp::I32TruncF64U { rd, dm } => {
848                return self.encode_arm_i32_trunc_f64(rd, dm, false);
849            }
850            // Multi-instruction sequences - only meaningful in Thumb-2 mode
851            ArmOp::I64SetCond { .. }
852            | ArmOp::I64SetCondZ { .. }
853            | ArmOp::I64Mul { .. }
854            | ArmOp::I64Shl { .. }
855            | ArmOp::I64ShrS { .. }
856            | ArmOp::I64ShrU { .. }
857            | ArmOp::I64Rotl { .. }
858            | ArmOp::I64Rotr { .. } => 0xE1A00000, // NOP (Thumb-2 only)
859
860            // MVE instructions — Thumb-2 only (Cortex-M55 is always Thumb-2)
861            ArmOp::MveLoad { .. }
862            | ArmOp::MveStore { .. }
863            | ArmOp::MveConst { .. }
864            | ArmOp::MveAnd { .. }
865            | ArmOp::MveOrr { .. }
866            | ArmOp::MveEor { .. }
867            | ArmOp::MveMvn { .. }
868            | ArmOp::MveBic { .. }
869            | ArmOp::MveAddI { .. }
870            | ArmOp::MveSubI { .. }
871            | ArmOp::MveMulI { .. }
872            | ArmOp::MveNegI { .. }
873            | ArmOp::MveCmpEqI { .. }
874            | ArmOp::MveCmpNeI { .. }
875            | ArmOp::MveCmpLtS { .. }
876            | ArmOp::MveCmpLtU { .. }
877            | ArmOp::MveCmpGtS { .. }
878            | ArmOp::MveCmpGtU { .. }
879            | ArmOp::MveCmpLeS { .. }
880            | ArmOp::MveCmpLeU { .. }
881            | ArmOp::MveCmpGeS { .. }
882            | ArmOp::MveCmpGeU { .. }
883            | ArmOp::MveDup { .. }
884            | ArmOp::MveExtractLane { .. }
885            | ArmOp::MveInsertLane { .. }
886            | ArmOp::MveAddF32 { .. }
887            | ArmOp::MveSubF32 { .. }
888            | ArmOp::MveMulF32 { .. }
889            | ArmOp::MveNegF32 { .. }
890            | ArmOp::MveAbsF32 { .. }
891            | ArmOp::MveCmpEqF32 { .. }
892            | ArmOp::MveCmpNeF32 { .. }
893            | ArmOp::MveCmpLtF32 { .. }
894            | ArmOp::MveCmpLeF32 { .. }
895            | ArmOp::MveCmpGtF32 { .. }
896            | ArmOp::MveCmpGeF32 { .. }
897            | ArmOp::MveDupF32 { .. }
898            | ArmOp::MveExtractLaneF32 { .. }
899            | ArmOp::MveReplaceLaneF32 { .. }
900            | ArmOp::MveDivF32 { .. }
901            | ArmOp::MveSqrtF32 { .. } => 0xE1A00000, // NOP (MVE = Thumb-2 only)
902        };
903
904        // ARM32 instructions are little-endian
905        Ok(instr.to_le_bytes().to_vec())
906    }
907
908    // === ARM32 VFP multi-instruction helpers ===
909
910    /// Encode F32 comparison as ARM32: VCMP.F32 + VMRS + MOV rd,#0 + MOVcond rd,#1
911    fn encode_arm_f32_compare(
912        &self,
913        rd: &Reg,
914        sn: &VfpReg,
915        sm: &VfpReg,
916        cond_code: u32,
917    ) -> Result<Vec<u8>> {
918        let mut bytes = Vec::new();
919
920        // VCMP.F32 Sn, Sm: 0xEEB40A40 with Sn in Vd position, Sm in Vm position
921        let sn_num = vfp_sreg_to_num(sn)?;
922        let sm_num = vfp_sreg_to_num(sm)?;
923        let (vd, d) = encode_sreg(sn_num);
924        let (vm, m) = encode_sreg(sm_num);
925        let vcmp = 0xEEB40A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
926        bytes.extend_from_slice(&vcmp.to_le_bytes());
927
928        // VMRS APSR_nzcv, FPSCR: 0xEEF1FA10
929        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
930
931        // MOV rd, #0: 0xE3A0_0000 | (rd << 12)
932        let rd_bits = reg_to_bits(rd);
933        let mov_zero = 0xE3A00000 | (rd_bits << 12);
934        bytes.extend_from_slice(&mov_zero.to_le_bytes());
935
936        // MOVcond rd, #1: cond(4) | 0011 1010 0000 rd(4) 0000 0000 0001
937        let mov_one = (cond_code << 28) | 0x03A00001 | (rd_bits << 12);
938        bytes.extend_from_slice(&mov_one.to_le_bytes());
939
940        Ok(bytes)
941    }
942
943    /// Encode F32 constant load as ARM32: MOVW Rt,#lo16 + MOVT Rt,#hi16 + VMOV Sd,Rt
944    fn encode_arm_f32_const(&self, sd: &VfpReg, value: f32) -> Result<Vec<u8>> {
945        let mut bytes = Vec::new();
946        let bits = value.to_bits();
947
948        // Use R12 as temp register for constant loading
949        let rt: u32 = 12; // R12/IP
950
951        // MOVW R12, #lo16: 0xE300_C000 | (imm4 << 16) | imm12
952        let lo16 = bits & 0xFFFF;
953        let movw = 0xE3000000 | (rt << 12) | ((lo16 >> 12) << 16) | (lo16 & 0xFFF);
954        bytes.extend_from_slice(&movw.to_le_bytes());
955
956        // MOVT R12, #hi16: 0xE340_C000 | (imm4 << 16) | imm12
957        let hi16 = (bits >> 16) & 0xFFFF;
958        let movt = 0xE3400000 | (rt << 12) | ((hi16 >> 12) << 16) | (hi16 & 0xFFF);
959        bytes.extend_from_slice(&movt.to_le_bytes());
960
961        // VMOV Sd, R12
962        let vmov = encode_vmov_core_sreg(true, sd, &Reg::R12)?;
963        bytes.extend_from_slice(&vmov.to_le_bytes());
964
965        Ok(bytes)
966    }
967
968    /// Encode VMOV + VCVT.F32.S32/U32 as ARM32
969    fn encode_arm_f32_convert_i32(&self, sd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
970        let mut bytes = Vec::new();
971
972        // VMOV Sd, Rm — move integer to VFP register
973        let vmov = encode_vmov_core_sreg(true, sd, rm)?;
974        bytes.extend_from_slice(&vmov.to_le_bytes());
975
976        // VCVT.F32.S32 Sd, Sd (signed) or VCVT.F32.U32 Sd, Sd (unsigned)
977        // Base: 0xEEB80A40 (signed) or 0xEEB80AC0 (unsigned)
978        let sd_num = vfp_sreg_to_num(sd)?;
979        let (vd, d) = encode_sreg(sd_num);
980        let (vm, m) = encode_sreg(sd_num); // same register as source
981        let base = if signed { 0xEEB80A40 } else { 0xEEB80AC0 };
982        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
983        bytes.extend_from_slice(&vcvt.to_le_bytes());
984
985        Ok(bytes)
986    }
987
988    /// Encode F32 rounding pseudo-op as ARM32 via VCVT to integer and back.
989    /// mode: 0b00=nearest, 0b01=floor(-Inf), 0b10=ceil(+Inf), 0b11=trunc(zero)
990    /// Strategy: VCVT.S32.F32 Sd, Sm (toward zero), then VCVT.F32.S32 Sd, Sd
991    /// For ceil/floor/nearest, we use VCVTR (round toward mode) + convert back.
992    /// Simplified: convert to int (toward zero for trunc) then back to float.
993    /// Encode F32 rounding as ARM32.
994    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
995    ///
996    /// For trunc (mode=0b11): uses VCVTR.S32.F32 (always rounds toward zero).
997    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F32 (non-R variant
998    /// which honours FPSCR rmode), then restores FPSCR.
999    fn encode_arm_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
1000        let mut bytes = Vec::new();
1001        let sm_num = vfp_sreg_to_num(sm)?;
1002        let sd_num = vfp_sreg_to_num(sd)?;
1003        let (vd_s, d_s) = encode_sreg(sd_num);
1004        let (vm_s, m_s) = encode_sreg(sm_num);
1005
1006        if mode == 0b11 {
1007            // Trunc (toward zero): VCVTR.S32.F32 — the "R" variant always truncates.
1008            // 0xEEBD0AC0: bit[7]=1 => round toward zero regardless of FPSCR
1009            let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
1010            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1011        } else {
1012            // ceil/floor/nearest: manipulate FPSCR rounding mode
1013            let rt: u32 = 12; // R12/IP as temp
1014
1015            // VMRS R12, FPSCR
1016            let vmrs = 0xEEF10A10 | (rt << 12);
1017            bytes.extend_from_slice(&vmrs.to_le_bytes());
1018
1019            // BIC R12, R12, #(3 << 22) — clear RMode bits [23:22]
1020            // 3<<22 = 0x00C00000. ARM rotated imm: 0x03 ror 10 (rotation=5, imm8=0x03)
1021            let bic = 0xE3CC0000 | (rt << 12) | (0x05 << 8) | 0x03;
1022            bytes.extend_from_slice(&bic.to_le_bytes());
1023
1024            // ORR R12, R12, #(mode << 22) — set desired rounding mode
1025            if mode != 0 {
1026                // mode<<22: rotation=5, imm8=mode
1027                let orr = 0xE38C0000 | (rt << 12) | (0x05 << 8) | (mode as u32);
1028                bytes.extend_from_slice(&orr.to_le_bytes());
1029            }
1030
1031            // VMSR FPSCR, R12
1032            let vmsr = 0xEEE10A10 | (rt << 12);
1033            bytes.extend_from_slice(&vmsr.to_le_bytes());
1034
1035            // VCVT.S32.F32 Sd, Sm — non-R variant (bit[7]=0), uses FPSCR rounding mode
1036            let vcvt_to_int = 0xEEBD0A40 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
1037            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1038
1039            // Restore FPSCR: clear rmode bits back to nearest (default)
1040            bytes.extend_from_slice(&vmrs.to_le_bytes());
1041            bytes.extend_from_slice(&bic.to_le_bytes());
1042            bytes.extend_from_slice(&vmsr.to_le_bytes());
1043        }
1044
1045        // VCVT.F32.S32 Sd, Sd (convert integer result back to float)
1046        let (vd2, d2) = encode_sreg(sd_num);
1047        let vcvt_to_float = 0xEEB80A40 | (d2 << 22) | (vd2 << 12) | (d_s << 5) | vd_s;
1048        bytes.extend_from_slice(&vcvt_to_float.to_le_bytes());
1049
1050        Ok(bytes)
1051    }
1052
1053    /// Encode F32 min/max as ARM32: VCMP + VMRS + conditional VMOV
1054    fn encode_arm_f32_minmax(
1055        &self,
1056        sd: &VfpReg,
1057        sn: &VfpReg,
1058        sm: &VfpReg,
1059        is_min: bool,
1060    ) -> Result<Vec<u8>> {
1061        let mut bytes = Vec::new();
1062        let sn_num = vfp_sreg_to_num(sn)?;
1063        let sm_num = vfp_sreg_to_num(sm)?;
1064        let sd_num = vfp_sreg_to_num(sd)?;
1065
1066        // VMOV Sd, Sn (start with first operand)
1067        let (vd, d) = encode_sreg(sd_num);
1068        let (vn, n) = encode_sreg(sn_num);
1069        let vmov_sn = 0xEEB00A40 | (d << 22) | (vd << 12) | (n << 5) | vn;
1070        bytes.extend_from_slice(&vmov_sn.to_le_bytes());
1071
1072        // VCMP.F32 Sn, Sm
1073        let (vm, m) = encode_sreg(sm_num);
1074        let vcmp = 0xEEB40A40 | (n << 22) | (vn << 12) | (m << 5) | vm;
1075        bytes.extend_from_slice(&vcmp.to_le_bytes());
1076
1077        // VMRS APSR_nzcv, FPSCR
1078        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1079
1080        // For min: if Sn > Sm (GT), use Sm. Condition = GT (0xC)
1081        // For max: if Sn < Sm (MI/LT), use Sm. Condition = MI (0x4)
1082        let cond = if is_min { 0xCu32 } else { 0x4u32 };
1083
1084        // VMOV{cond} Sd, Sm — conditional VMOV
1085        let vmov_cond = (cond << 28) | 0x0EB00A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1086        bytes.extend_from_slice(&vmov_cond.to_le_bytes());
1087
1088        Ok(bytes)
1089    }
1090
1091    /// Encode F32 copysign as ARM32: extract sign from Sm, magnitude from Sn
1092    fn encode_arm_f32_copysign(&self, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
1093        let mut bytes = Vec::new();
1094
1095        // VMOV R12, Sm (get sign source bits)
1096        let vmov_sm = encode_vmov_core_sreg(false, sm, &Reg::R12)?;
1097        bytes.extend_from_slice(&vmov_sm.to_le_bytes());
1098
1099        // VMOV R0, Sn (get magnitude source bits) — use R0 as temp
1100        let vmov_sn = encode_vmov_core_sreg(false, sn, &Reg::R0)?;
1101        bytes.extend_from_slice(&vmov_sn.to_le_bytes());
1102
1103        // AND R12, R12, #0x80000000 (keep only sign bit)
1104        // Thumb-2 constant 0x80000000 needs special encoding; in ARM32 use rotated imm
1105        // 0x80000000 = 0x02 rotated right by 2 (rotation=1, imm8=0x02)
1106        let and_sign = 0xE2000000u32 | (12 << 16) | (12 << 12) | (1 << 8) | 0x02;
1107        bytes.extend_from_slice(&and_sign.to_le_bytes());
1108
1109        // BIC R0, R0, #0x80000000 (clear sign bit from magnitude)
1110        // R0 = register 0, so Rn and Rd fields are 0
1111        let bic_sign = 0xE3C00000u32 | (1 << 8) | 0x02;
1112        bytes.extend_from_slice(&bic_sign.to_le_bytes());
1113
1114        // ORR R0, R0, R12 (combine sign + magnitude)
1115        // R0 = register 0, so Rn and Rd fields are 0
1116        let orr = 0xE1800000u32 | 12;
1117        bytes.extend_from_slice(&orr.to_le_bytes());
1118
1119        // VMOV Sd, R0
1120        let vmov_result = encode_vmov_core_sreg(true, sd, &Reg::R0)?;
1121        bytes.extend_from_slice(&vmov_result.to_le_bytes());
1122
1123        Ok(bytes)
1124    }
1125
1126    /// Encode F64 comparison as ARM32: VCMP.F64 + VMRS + MOV rd,#0 + MOVcond rd,#1
1127    fn encode_arm_f64_compare(
1128        &self,
1129        rd: &Reg,
1130        dn: &VfpReg,
1131        dm: &VfpReg,
1132        cond_code: u32,
1133    ) -> Result<Vec<u8>> {
1134        let mut bytes = Vec::new();
1135
1136        // VCMP.F64 Dn, Dm: 0xEEB40B40 with Dn in Vd position, Dm in Vm position
1137        let dn_num = vfp_dreg_to_num(dn)?;
1138        let dm_num = vfp_dreg_to_num(dm)?;
1139        let (vd, d) = encode_dreg(dn_num);
1140        let (vm, m) = encode_dreg(dm_num);
1141        let vcmp = 0xEEB40B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1142        bytes.extend_from_slice(&vcmp.to_le_bytes());
1143
1144        // VMRS APSR_nzcv, FPSCR
1145        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1146
1147        // MOV rd, #0
1148        let rd_bits = reg_to_bits(rd);
1149        let mov_zero = 0xE3A00000 | (rd_bits << 12);
1150        bytes.extend_from_slice(&mov_zero.to_le_bytes());
1151
1152        // MOVcond rd, #1
1153        let mov_one = (cond_code << 28) | 0x03A00001 | (rd_bits << 12);
1154        bytes.extend_from_slice(&mov_one.to_le_bytes());
1155
1156        Ok(bytes)
1157    }
1158
1159    /// Encode F64 constant load as ARM32: MOVW + MOVT + MOVW + MOVT + VMOV
1160    fn encode_arm_f64_const(&self, dd: &VfpReg, value: f64) -> Result<Vec<u8>> {
1161        let mut bytes = Vec::new();
1162        let bits = value.to_bits();
1163        let lo32 = bits as u32;
1164        let hi32 = (bits >> 32) as u32;
1165
1166        // Load low 32 bits into R0 (Rd field = 0 for R0)
1167        let lo16 = lo32 & 0xFFFF;
1168        let movw_r0 = 0xE3000000 | ((lo16 >> 12) << 16) | (lo16 & 0xFFF);
1169        bytes.extend_from_slice(&movw_r0.to_le_bytes());
1170        let hi16 = (lo32 >> 16) & 0xFFFF;
1171        let movt_r0 = 0xE3400000 | ((hi16 >> 12) << 16) | (hi16 & 0xFFF);
1172        bytes.extend_from_slice(&movt_r0.to_le_bytes());
1173
1174        // Load high 32 bits into R12
1175        let lo16 = hi32 & 0xFFFF;
1176        let movw_r12 = 0xE3000000 | ((lo16 >> 12) << 16) | (12 << 12) | (lo16 & 0xFFF);
1177        bytes.extend_from_slice(&movw_r12.to_le_bytes());
1178        let hi16 = (hi32 >> 16) & 0xFFFF;
1179        let movt_r12 = 0xE3400000 | ((hi16 >> 12) << 16) | (12 << 12) | (hi16 & 0xFFF);
1180        bytes.extend_from_slice(&movt_r12.to_le_bytes());
1181
1182        // VMOV Dd, R0, R12
1183        let vmov = encode_vmov_core_dreg(true, dd, &Reg::R0, &Reg::R12)?;
1184        bytes.extend_from_slice(&vmov.to_le_bytes());
1185
1186        Ok(bytes)
1187    }
1188
1189    /// Encode VMOV Sd, Rm + VCVT.F64.S32/U32 Dd, Sd as ARM32
1190    fn encode_arm_f64_convert_i32(&self, dd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
1191        let mut bytes = Vec::new();
1192
1193        // Use S0 as intermediate: VMOV S0, Rm
1194        let vmov = encode_vmov_core_sreg(true, &VfpReg::S0, rm)?;
1195        bytes.extend_from_slice(&vmov.to_le_bytes());
1196
1197        // VCVT.F64.S32 Dd, S0 (signed) or VCVT.F64.U32 Dd, S0 (unsigned)
1198        // Base: 0xEEB80B40 (signed) or 0xEEB80BC0 (unsigned)
1199        let dd_num = vfp_dreg_to_num(dd)?;
1200        let (vd, d) = encode_dreg(dd_num);
1201        let base = if signed { 0xEEB80B40 } else { 0xEEB80BC0 };
1202        // S0 is register 0: Vm=0, M=0
1203        let vcvt = base | (d << 22) | (vd << 12);
1204        bytes.extend_from_slice(&vcvt.to_le_bytes());
1205
1206        Ok(bytes)
1207    }
1208
1209    /// Encode VCVT.F64.F32 Dd, Sm as ARM32 (f32 to f64 promotion)
1210    fn encode_arm_f64_promote_f32(&self, dd: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
1211        let dd_num = vfp_dreg_to_num(dd)?;
1212        let sm_num = vfp_sreg_to_num(sm)?;
1213        let (vd, d) = encode_dreg(dd_num);
1214        let (vm, m) = encode_sreg(sm_num);
1215
1216        // VCVT.F64.F32 Dd, Sm: 0xEEB70AC0
1217        let vcvt = 0xEEB70AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
1218        Ok(vcvt.to_le_bytes().to_vec())
1219    }
1220
1221    /// Encode VCVT.S32/U32.F64 Sd, Dm + VMOV Rd, Sd as ARM32
1222    fn encode_arm_i32_trunc_f64(&self, rd: &Reg, dm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
1223        let mut bytes = Vec::new();
1224        let dm_num = vfp_dreg_to_num(dm)?;
1225        let (vm, m) = encode_dreg(dm_num);
1226
1227        // VCVT.S32.F64 S0, Dm (toward zero) or VCVT.U32.F64 S0, Dm
1228        // S0: Vd=0, D=0
1229        let base = if signed { 0xEEBD0BC0 } else { 0xEEBC0BC0 };
1230        let vcvt = base | (m << 5) | vm;
1231        bytes.extend_from_slice(&vcvt.to_le_bytes());
1232
1233        // VMOV Rd, S0
1234        let vmov = encode_vmov_core_sreg(false, &VfpReg::S0, rd)?;
1235        bytes.extend_from_slice(&vmov.to_le_bytes());
1236
1237        Ok(bytes)
1238    }
1239
1240    /// Encode F64 rounding pseudo-op as ARM32 via VCVT to integer and back.
1241    /// Encode F64 rounding as ARM32.
1242    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
1243    ///
1244    /// For trunc: uses VCVTR.S32.F64 (always truncates).
1245    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F64 (non-R variant),
1246    /// then restores FPSCR.
1247    fn encode_arm_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
1248        let mut bytes = Vec::new();
1249        let dm_num = vfp_dreg_to_num(dm)?;
1250        let dd_num = vfp_dreg_to_num(dd)?;
1251        let (vm, m) = encode_dreg(dm_num);
1252        let (vd, d) = encode_dreg(dd_num);
1253
1254        if mode == 0b11 {
1255            // Trunc (toward zero): VCVTR.S32.F64 — bit[7]=1, always truncates
1256            let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
1257            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1258        } else {
1259            // ceil/floor/nearest: manipulate FPSCR rounding mode
1260            let rt: u32 = 12;
1261
1262            // VMRS R12, FPSCR
1263            let vmrs = 0xEEF10A10 | (rt << 12);
1264            bytes.extend_from_slice(&vmrs.to_le_bytes());
1265
1266            // BIC R12, R12, #(3 << 22)
1267            let bic = 0xE3CC0000 | (rt << 12) | (0x05 << 8) | 0x03;
1268            bytes.extend_from_slice(&bic.to_le_bytes());
1269
1270            // ORR R12, R12, #(mode << 22)
1271            if mode != 0 {
1272                let orr = 0xE38C0000 | (rt << 12) | (0x05 << 8) | (mode as u32);
1273                bytes.extend_from_slice(&orr.to_le_bytes());
1274            }
1275
1276            // VMSR FPSCR, R12
1277            let vmsr = 0xEEE10A10 | (rt << 12);
1278            bytes.extend_from_slice(&vmsr.to_le_bytes());
1279
1280            // VCVT.S32.F64 S0, Dm — non-R variant (bit[7]=0), uses FPSCR rmode
1281            let vcvt_to_int = 0xEEBD0B40 | (m << 5) | vm;
1282            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1283
1284            // Restore FPSCR
1285            bytes.extend_from_slice(&vmrs.to_le_bytes());
1286            bytes.extend_from_slice(&bic.to_le_bytes());
1287            bytes.extend_from_slice(&vmsr.to_le_bytes());
1288        }
1289
1290        // VCVT.F64.S32 Dd, S0 (convert back to double)
1291        let vcvt_to_float = 0xEEB80B40 | (d << 22) | (vd << 12);
1292        bytes.extend_from_slice(&vcvt_to_float.to_le_bytes());
1293
1294        Ok(bytes)
1295    }
1296
1297    /// Encode F64 min/max as ARM32: VMOV + VCMP + VMRS + conditional VMOV
1298    fn encode_arm_f64_minmax(
1299        &self,
1300        dd: &VfpReg,
1301        dn: &VfpReg,
1302        dm: &VfpReg,
1303        is_min: bool,
1304    ) -> Result<Vec<u8>> {
1305        let mut bytes = Vec::new();
1306        let dn_num = vfp_dreg_to_num(dn)?;
1307        let dm_num = vfp_dreg_to_num(dm)?;
1308        let dd_num = vfp_dreg_to_num(dd)?;
1309
1310        // VMOV.F64 Dd, Dn (start with first operand)
1311        let (vd, d) = encode_dreg(dd_num);
1312        let (vn, n) = encode_dreg(dn_num);
1313        let vmov_dn = 0xEEB00B40 | (d << 22) | (vd << 12) | (n << 5) | vn;
1314        bytes.extend_from_slice(&vmov_dn.to_le_bytes());
1315
1316        // VCMP.F64 Dn, Dm
1317        let (vm, m) = encode_dreg(dm_num);
1318        let vcmp = 0xEEB40B40 | (n << 22) | (vn << 12) | (m << 5) | vm;
1319        bytes.extend_from_slice(&vcmp.to_le_bytes());
1320
1321        // VMRS APSR_nzcv, FPSCR
1322        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1323
1324        let cond = if is_min { 0xCu32 } else { 0x4u32 };
1325        let vmov_cond = (cond << 28) | 0x0EB00B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1326        bytes.extend_from_slice(&vmov_cond.to_le_bytes());
1327
1328        Ok(bytes)
1329    }
1330
1331    /// Encode F64 copysign as ARM32
1332    fn encode_arm_f64_copysign(&self, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<Vec<u8>> {
1333        let mut bytes = Vec::new();
1334
1335        // VMOV R0, R12, Dm (get sign source bits)
1336        let vmov_dm = encode_vmov_core_dreg(false, dm, &Reg::R0, &Reg::R12)?;
1337        bytes.extend_from_slice(&vmov_dm.to_le_bytes());
1338
1339        // VMOV R1, R2, Dn (get magnitude source bits)
1340        // We use R1 (lo) and R2 (hi) for the magnitude
1341        let vmov_dn = encode_vmov_core_dreg(false, dn, &Reg::R1, &Reg::R2)?;
1342        bytes.extend_from_slice(&vmov_dn.to_le_bytes());
1343
1344        // AND R12, R12, #0x80000000 (keep only sign bit from hi word)
1345        let and_sign = 0xE2000000u32 | (12 << 16) | (12 << 12) | (1 << 8) | 0x02;
1346        bytes.extend_from_slice(&and_sign.to_le_bytes());
1347
1348        // BIC R2, R2, #0x80000000 (clear sign bit from magnitude hi word)
1349        let bic_sign = 0xE3C00000u32 | (2 << 16) | (2 << 12) | (1 << 8) | 0x02;
1350        bytes.extend_from_slice(&bic_sign.to_le_bytes());
1351
1352        // ORR R2, R2, R12 (combine sign + magnitude)
1353        let orr = 0xE1800000u32 | (2 << 16) | (2 << 12) | 12;
1354        bytes.extend_from_slice(&orr.to_le_bytes());
1355
1356        // VMOV Dd, R1, R2
1357        let vmov_result = encode_vmov_core_dreg(true, dd, &Reg::R1, &Reg::R2)?;
1358        bytes.extend_from_slice(&vmov_result.to_le_bytes());
1359
1360        Ok(bytes)
1361    }
1362
1363    /// Encode VCVT.S32/U32.F32 + VMOV as ARM32
1364    fn encode_arm_i32_trunc_f32(&self, rd: &Reg, sm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
1365        let mut bytes = Vec::new();
1366
1367        // VCVT.S32.F32 Sd, Sm (toward zero) or VCVT.U32.F32 Sd, Sm
1368        // We use Sm as both source and destination for the intermediate result
1369        let sm_num = vfp_sreg_to_num(sm)?;
1370        let (vd, d) = encode_sreg(sm_num);
1371        let (vm, m) = encode_sreg(sm_num);
1372        let base = if signed { 0xEEBD0AC0 } else { 0xEEBC0AC0 };
1373        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
1374        bytes.extend_from_slice(&vcvt.to_le_bytes());
1375
1376        // VMOV Rd, Sm — move result back to core register
1377        let vmov = encode_vmov_core_sreg(false, sm, rd)?;
1378        bytes.extend_from_slice(&vmov.to_le_bytes());
1379
1380        Ok(bytes)
1381    }
1382
1383    /// Encode an ARM instruction in Thumb-2 mode (16-bit or 32-bit instructions)
1384    fn encode_thumb(&self, op: &ArmOp) -> Result<Vec<u8>> {
1385        // Thumb-2 supports both 16-bit and 32-bit instructions
1386        // 32-bit instructions are encoded as two 16-bit halfwords (big-endian order)
1387        match op {
1388            // === 16-bit Thumb encodings ===
1389            ArmOp::Add { rd, rn, op2 } => {
1390                let rd_bits = reg_to_bits(rd) as u16;
1391                let rn_bits = reg_to_bits(rn) as u16;
1392
1393                if let Operand2::Reg(rm) = op2 {
1394                    let rm_bits = reg_to_bits(rm) as u16;
1395                    // 16-bit ADDS only has 3-bit register fields (R0-R7). For
1396                    // high registers (e.g. R12, the MemLoad/MemStore base
1397                    // scratch) the bits overflow into adjacent fields, silently
1398                    // corrupting the operands — issue #178/#180: `add ip,ip,r0`
1399                    // was emitted as `adds r4,r5,r1`. Guard on all three regs
1400                    // being low and fall back to 32-bit ADD.W otherwise, exactly
1401                    // as the Sub handler below does.
1402                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1403                        // ADDS Rd, Rn, Rm (16-bit): 0001 100 Rm Rn Rd
1404                        let instr: u16 = 0x1800 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1405                        Ok(instr.to_le_bytes().to_vec())
1406                    } else {
1407                        // ADD.W Rd, Rn, Rm (32-bit) for high registers
1408                        self.encode_thumb32_add_reg_raw(
1409                            rd_bits as u32,
1410                            rn_bits as u32,
1411                            rm_bits as u32,
1412                        )
1413                    }
1414                } else if let Operand2::Imm(imm) = op2 {
1415                    if *imm <= 7 && rd_bits < 8 && rn_bits < 8 {
1416                        // ADDS Rd, Rn, #imm3 (16-bit): 0001 110 imm3 Rn Rd
1417                        let instr: u16 = 0x1C00 | ((*imm as u16) << 6) | (rn_bits << 3) | rd_bits;
1418                        Ok(instr.to_le_bytes().to_vec())
1419                    } else {
1420                        // Use 32-bit ADD for larger immediates
1421                        self.encode_thumb32_add(rd, rn, *imm as u32)
1422                    }
1423                } else {
1424                    // Fallback to 32-bit encoding
1425                    self.encode_thumb32_add(rd, rn, 0)
1426                }
1427            }
1428
1429            ArmOp::Sub { rd, rn, op2 } => {
1430                let rd_bits = reg_to_bits(rd) as u16;
1431                let rn_bits = reg_to_bits(rn) as u16;
1432
1433                if let Operand2::Reg(rm) = op2 {
1434                    let rm_bits = reg_to_bits(rm) as u16;
1435                    // 16-bit SUBS can only use low registers (R0-R7)
1436                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1437                        // SUBS Rd, Rn, Rm (16-bit): 0001 101 Rm Rn Rd
1438                        let instr: u16 = 0x1A00 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1439                        Ok(instr.to_le_bytes().to_vec())
1440                    } else {
1441                        // Use 32-bit SUB.W for high registers
1442                        self.encode_thumb32_sub_reg_raw(
1443                            rd_bits as u32,
1444                            rn_bits as u32,
1445                            rm_bits as u32,
1446                        )
1447                    }
1448                } else if let Operand2::Imm(imm) = op2 {
1449                    if *imm <= 7 && rd_bits < 8 && rn_bits < 8 {
1450                        // SUBS Rd, Rn, #imm3 (16-bit): 0001 111 imm3 Rn Rd
1451                        let instr: u16 = 0x1E00 | ((*imm as u16) << 6) | (rn_bits << 3) | rd_bits;
1452                        Ok(instr.to_le_bytes().to_vec())
1453                    } else {
1454                        self.encode_thumb32_sub(rd, rn, *imm as u32)
1455                    }
1456                } else {
1457                    self.encode_thumb32_sub(rd, rn, 0)
1458                }
1459            }
1460
1461            ArmOp::Mov { rd, op2 } => {
1462                let rd_bits = reg_to_bits(rd) as u16;
1463
1464                if let Operand2::Imm(imm) = op2 {
1465                    if *imm <= 255 && rd_bits < 8 {
1466                        // MOVS Rd, #imm8 (16-bit): 0010 0 Rd imm8
1467                        let imm_bits = (*imm as u16) & 0xFF;
1468                        let instr: u16 = 0x2000 | (rd_bits << 8) | imm_bits;
1469                        Ok(instr.to_le_bytes().to_vec())
1470                    } else {
1471                        // Use 32-bit MOVW for larger immediates
1472                        self.encode_thumb32_movw(rd, *imm as u32)
1473                    }
1474                } else if let Operand2::Reg(rm) = op2 {
1475                    let rm_bits = reg_to_bits(rm) as u16;
1476                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
1477                    // D = Rd[3], Rd[2:0] in lower bits
1478                    let d_bit = (rd_bits >> 3) & 1;
1479                    let instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
1480                    Ok(instr.to_le_bytes().to_vec())
1481                } else {
1482                    let instr: u16 = 0xBF00; // NOP fallback
1483                    Ok(instr.to_le_bytes().to_vec())
1484                }
1485            }
1486
1487            ArmOp::Push { regs } => {
1488                // Thumb-2 PUSH encoding:
1489                // If all regs in R0-R7 + LR, use 16-bit: 1011 010 M rrrrrrrr
1490                // Otherwise use 32-bit: STMDB SP!, {regs} = 1110 1001 0010 1101 | 0M0 reglist(13)
1491                let mut reg_list: u16 = 0;
1492                let mut need_32bit = false;
1493                for r in regs {
1494                    let bit = reg_to_bits(r);
1495                    if bit >= 8 && *r != Reg::LR {
1496                        need_32bit = true;
1497                    }
1498                    reg_list |= 1 << bit;
1499                }
1500                if !need_32bit {
1501                    // 16-bit PUSH: 1011 010 M rrrrrrrr
1502                    let m_bit = if reg_list & (1 << 14) != 0 {
1503                        1u16
1504                    } else {
1505                        0u16
1506                    };
1507                    let low_regs = reg_list & 0xFF;
1508                    let instr: u16 = 0xB400 | (m_bit << 8) | low_regs;
1509                    Ok(instr.to_le_bytes().to_vec())
1510                } else {
1511                    // 32-bit STMDB SP!, {regs}: E92D | reglist(16)
1512                    let hw1: u16 = 0xE92D;
1513                    let hw2: u16 = reg_list;
1514                    let mut bytes = hw1.to_le_bytes().to_vec();
1515                    bytes.extend_from_slice(&hw2.to_le_bytes());
1516                    Ok(bytes)
1517                }
1518            }
1519
1520            ArmOp::Pop { regs } => {
1521                // Thumb-2 POP encoding:
1522                // If all regs in R0-R7 + PC, use 16-bit: 1011 110 P rrrrrrrr
1523                // Otherwise use 32-bit: LDMIA SP!, {regs} = 1110 1000 1011 1101 | PM0 reglist(13)
1524                let mut reg_list: u16 = 0;
1525                let mut need_32bit = false;
1526                for r in regs {
1527                    let bit = reg_to_bits(r);
1528                    if bit >= 8 && *r != Reg::PC {
1529                        need_32bit = true;
1530                    }
1531                    reg_list |= 1 << bit;
1532                }
1533                if !need_32bit {
1534                    // 16-bit POP: 1011 110 P rrrrrrrr
1535                    let p_bit = if reg_list & (1 << 15) != 0 {
1536                        1u16
1537                    } else {
1538                        0u16
1539                    };
1540                    let low_regs = reg_list & 0xFF;
1541                    let instr: u16 = 0xBC00 | (p_bit << 8) | low_regs;
1542                    Ok(instr.to_le_bytes().to_vec())
1543                } else {
1544                    // 32-bit LDMIA SP!, {regs}: E8BD | reglist(16)
1545                    let hw1: u16 = 0xE8BD;
1546                    let hw2: u16 = reg_list;
1547                    let mut bytes = hw1.to_le_bytes().to_vec();
1548                    bytes.extend_from_slice(&hw2.to_le_bytes());
1549                    Ok(bytes)
1550                }
1551            }
1552
1553            ArmOp::Nop => {
1554                let instr: u16 = 0xBF00; // NOP in Thumb-2
1555                Ok(instr.to_le_bytes().to_vec())
1556            }
1557
1558            ArmOp::Udf { imm } => {
1559                // UDF (Undefined) in Thumb-2: 16-bit encoding is 0xDE00 | imm8
1560                // This triggers UsageFault/HardFault, used for WASM traps
1561                let instr: u16 = 0xDE00 | (*imm as u16);
1562                let bytes = instr.to_le_bytes().to_vec();
1563                encoding_contracts::verify_thumb16(&bytes);
1564                Ok(bytes)
1565            }
1566
1567            // i64 support: ADDS, ADC, SUBS, SBC for register pair arithmetic
1568            // ADDS sets flags (carry), ADC uses carry from previous ADDS
1569            ArmOp::Adds { rd, rn, op2 } => {
1570                let rd_bits = reg_to_bits(rd) as u16;
1571                let rn_bits = reg_to_bits(rn) as u16;
1572
1573                if let Operand2::Reg(rm) = op2 {
1574                    let rm_bits = reg_to_bits(rm) as u16;
1575                    // 16-bit ADDS is R0-R7 only; i64 pair allocation can place
1576                    // operands in R8-R11, which would overflow the 3-bit fields
1577                    // and corrupt the operands (#178/#180 class). Guard and fall
1578                    // back to 32-bit ADDS.W for high registers.
1579                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1580                        // ADDS Rd, Rn, Rm (16-bit): 0001 100 Rm Rn Rd
1581                        let instr: u16 = 0x1800 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1582                        Ok(instr.to_le_bytes().to_vec())
1583                    } else {
1584                        self.encode_thumb32_adds_reg_raw(
1585                            rd_bits as u32,
1586                            rn_bits as u32,
1587                            rm_bits as u32,
1588                        )
1589                    }
1590                } else {
1591                    // 32-bit Thumb-2 ADDS with immediate
1592                    self.encode_thumb32_adds(rd, rn, 0)
1593                }
1594            }
1595
1596            // ADC: Add with Carry (Thumb-2 32-bit)
1597            // ADC.W Rd, Rn, Rm: EB40 Rn | 00 Rd 00 Rm
1598            ArmOp::Adc { rd, rn, op2 } => {
1599                let rd_bits = reg_to_bits(rd);
1600                let rn_bits = reg_to_bits(rn);
1601
1602                if let Operand2::Reg(rm) = op2 {
1603                    let rm_bits = reg_to_bits(rm);
1604                    // ADC.W Rd, Rn, Rm (T2): 1110 1011 0100 Rn | 0 000 Rd 00 00 Rm
1605                    let hw1: u16 = (0xEB40 | rn_bits) as u16;
1606                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1607
1608                    let mut bytes = hw1.to_le_bytes().to_vec();
1609                    bytes.extend_from_slice(&hw2.to_le_bytes());
1610                    Ok(bytes)
1611                } else {
1612                    // ADC with immediate - use 32-bit encoding
1613                    let hw1: u16 = (0xF140 | rn_bits) as u16;
1614                    let hw2: u16 = (rd_bits << 8) as u16;
1615                    let mut bytes = hw1.to_le_bytes().to_vec();
1616                    bytes.extend_from_slice(&hw2.to_le_bytes());
1617                    Ok(bytes)
1618                }
1619            }
1620
1621            // SUBS sets flags (borrow), SBC uses borrow from previous SUBS
1622            ArmOp::Subs { rd, rn, op2 } => {
1623                let rd_bits = reg_to_bits(rd) as u16;
1624                let rn_bits = reg_to_bits(rn) as u16;
1625
1626                if let Operand2::Reg(rm) = op2 {
1627                    let rm_bits = reg_to_bits(rm) as u16;
1628                    // 16-bit SUBS is R0-R7 only; high-register i64 pair operands
1629                    // would overflow the 3-bit fields (#178/#180 class). Guard
1630                    // and fall back to 32-bit SUBS.W for high registers.
1631                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1632                        // SUBS Rd, Rn, Rm (16-bit): 0001 101 Rm Rn Rd
1633                        let instr: u16 = 0x1A00 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1634                        Ok(instr.to_le_bytes().to_vec())
1635                    } else {
1636                        self.encode_thumb32_subs_reg_raw(
1637                            rd_bits as u32,
1638                            rn_bits as u32,
1639                            rm_bits as u32,
1640                        )
1641                    }
1642                } else {
1643                    // 32-bit Thumb-2 SUBS with immediate
1644                    self.encode_thumb32_subs(rd, rn, 0)
1645                }
1646            }
1647
1648            // SBC: Subtract with Carry (Thumb-2 32-bit)
1649            // SBC.W Rd, Rn, Rm: EB60 Rn | 00 Rd 00 Rm
1650            ArmOp::Sbc { rd, rn, op2 } => {
1651                let rd_bits = reg_to_bits(rd);
1652                let rn_bits = reg_to_bits(rn);
1653
1654                if let Operand2::Reg(rm) = op2 {
1655                    let rm_bits = reg_to_bits(rm);
1656                    // SBC.W Rd, Rn, Rm (T2): 1110 1011 0110 Rn | 0 000 Rd 00 00 Rm
1657                    let hw1: u16 = (0xEB60 | rn_bits) as u16;
1658                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1659
1660                    let mut bytes = hw1.to_le_bytes().to_vec();
1661                    bytes.extend_from_slice(&hw2.to_le_bytes());
1662                    Ok(bytes)
1663                } else {
1664                    // SBC with immediate - use 32-bit encoding
1665                    let hw1: u16 = (0xF160 | rn_bits) as u16;
1666                    let hw2: u16 = (rd_bits << 8) as u16;
1667                    let mut bytes = hw1.to_le_bytes().to_vec();
1668                    bytes.extend_from_slice(&hw2.to_le_bytes());
1669                    Ok(bytes)
1670                }
1671            }
1672
1673            // === 32-bit Thumb-2 encodings ===
1674
1675            // SDIV: 11111011 1001 Rn 1111 Rd 1111 Rm
1676            ArmOp::Sdiv { rd, rn, rm } => {
1677                let rd_bits = reg_to_bits(rd);
1678                let rn_bits = reg_to_bits(rn);
1679                let rm_bits = reg_to_bits(rm);
1680                reg_bits_checked(rd_bits)?;
1681                reg_bits_checked(rn_bits)?;
1682                reg_bits_checked(rm_bits)?;
1683
1684                // Thumb-2 SDIV: FB90 F0F0 | Rn<<16 | Rd<<8 | Rm
1685                // First halfword: 1111 1011 1001 Rn = 0xFB90 | Rn
1686                // Second halfword: 1111 Rd 1111 Rm = 0xF0F0 | Rd<<8 | Rm
1687                let hw1: u16 = (0xFB90 | rn_bits) as u16;
1688                let hw2: u16 = (0xF0F0 | (rd_bits << 8) | rm_bits) as u16;
1689
1690                // Thumb-2 32-bit instructions: first halfword, then second halfword (little-endian each)
1691                let mut bytes = hw1.to_le_bytes().to_vec();
1692                bytes.extend_from_slice(&hw2.to_le_bytes());
1693                encoding_contracts::verify_thumb32(&bytes);
1694                Ok(bytes)
1695            }
1696
1697            // UDIV: 11111011 1011 Rn 1111 Rd 1111 Rm
1698            ArmOp::Udiv { rd, rn, rm } => {
1699                let rd_bits = reg_to_bits(rd);
1700                let rn_bits = reg_to_bits(rn);
1701                let rm_bits = reg_to_bits(rm);
1702                reg_bits_checked(rd_bits)?;
1703                reg_bits_checked(rn_bits)?;
1704                reg_bits_checked(rm_bits)?;
1705
1706                // Thumb-2 UDIV: FBB0 F0F0 | Rn<<16 | Rd<<8 | Rm
1707                let hw1: u16 = (0xFBB0 | rn_bits) as u16;
1708                let hw2: u16 = (0xF0F0 | (rd_bits << 8) | rm_bits) as u16;
1709
1710                let mut bytes = hw1.to_le_bytes().to_vec();
1711                bytes.extend_from_slice(&hw2.to_le_bytes());
1712                encoding_contracts::verify_thumb32(&bytes);
1713                Ok(bytes)
1714            }
1715
1716            // MUL (Thumb-2 32-bit): MUL Rd, Rn, Rm
1717            ArmOp::Mul { rd, rn, rm } => {
1718                let rd_bits = reg_to_bits(rd);
1719                let rn_bits = reg_to_bits(rn);
1720                let rm_bits = reg_to_bits(rm);
1721
1722                // Thumb-2 MUL: FB00 F000 | Rn | Rd<<8 | Rm
1723                // 11111011 0000 Rn | 1111 Rd 0000 Rm
1724                let hw1: u16 = (0xFB00 | rn_bits) as u16;
1725                let hw2: u16 = (0xF000 | (rd_bits << 8) | rm_bits) as u16;
1726
1727                let mut bytes = hw1.to_le_bytes().to_vec();
1728                bytes.extend_from_slice(&hw2.to_le_bytes());
1729                Ok(bytes)
1730            }
1731
1732            // MLS: Rd = Ra - Rn * Rm
1733            ArmOp::Mls { rd, rn, rm, ra } => {
1734                let rd_bits = reg_to_bits(rd);
1735                let rn_bits = reg_to_bits(rn);
1736                let rm_bits = reg_to_bits(rm);
1737                let ra_bits = reg_to_bits(ra);
1738
1739                // Thumb-2 MLS: FB00 Rn | Ra Rd 0001 Rm
1740                // 11111011 0000 Rn | Ra Rd 0001 Rm
1741                let hw1: u16 = (0xFB00 | rn_bits) as u16;
1742                let hw2: u16 = ((ra_bits << 12) | (rd_bits << 8) | 0x10 | rm_bits) as u16;
1743
1744                let mut bytes = hw1.to_le_bytes().to_vec();
1745                bytes.extend_from_slice(&hw2.to_le_bytes());
1746                Ok(bytes)
1747            }
1748
1749            // AND (Thumb-2 32-bit)
1750            ArmOp::And { rd, rn, op2 } => {
1751                if let Operand2::Reg(rm) = op2 {
1752                    let rd_bits = reg_to_bits(rd);
1753                    let rn_bits = reg_to_bits(rn);
1754                    let rm_bits = reg_to_bits(rm);
1755
1756                    // Thumb-2 AND register: EA00 Rn | 0 Rd 00 00 Rm
1757                    let hw1: u16 = (0xEA00 | rn_bits) as u16;
1758                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1759
1760                    let mut bytes = hw1.to_le_bytes().to_vec();
1761                    bytes.extend_from_slice(&hw2.to_le_bytes());
1762                    Ok(bytes)
1763                } else if let Operand2::Imm(imm) = op2 {
1764                    let rd_bits = reg_to_bits(rd);
1765                    let rn_bits = reg_to_bits(rn);
1766                    let imm_val = *imm as u32;
1767
1768                    // Thumb-2 AND.W immediate T1: 11110 i 0 0000 S Rn | 0 imm3 Rd imm8
1769                    let i_bit = (imm_val >> 11) & 1;
1770                    let imm3 = (imm_val >> 8) & 0x7;
1771                    let imm8 = imm_val & 0xFF;
1772
1773                    let hw1: u16 = (0xF000 | (i_bit << 10) | rn_bits) as u16;
1774                    let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
1775
1776                    let mut bytes = hw1.to_le_bytes().to_vec();
1777                    bytes.extend_from_slice(&hw2.to_le_bytes());
1778                    Ok(bytes)
1779                } else {
1780                    // RegShift variant - fallback to NOP
1781                    let instr: u16 = 0xBF00;
1782                    Ok(instr.to_le_bytes().to_vec())
1783                }
1784            }
1785
1786            // ORR (Thumb-2 32-bit)
1787            ArmOp::Orr { rd, rn, op2 } => {
1788                if let Operand2::Reg(rm) = op2 {
1789                    let rd_bits = reg_to_bits(rd);
1790                    let rn_bits = reg_to_bits(rn);
1791                    let rm_bits = reg_to_bits(rm);
1792
1793                    // Thumb-2 ORR: EA40 Rn | 0 Rd 00 00 Rm
1794                    let hw1: u16 = (0xEA40 | rn_bits) as u16;
1795                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1796
1797                    let mut bytes = hw1.to_le_bytes().to_vec();
1798                    bytes.extend_from_slice(&hw2.to_le_bytes());
1799                    Ok(bytes)
1800                } else {
1801                    let instr: u16 = 0xBF00;
1802                    Ok(instr.to_le_bytes().to_vec())
1803                }
1804            }
1805
1806            // EOR (Thumb-2 32-bit)
1807            ArmOp::Eor { rd, rn, op2 } => {
1808                if let Operand2::Reg(rm) = op2 {
1809                    let rd_bits = reg_to_bits(rd);
1810                    let rn_bits = reg_to_bits(rn);
1811                    let rm_bits = reg_to_bits(rm);
1812
1813                    // Thumb-2 EOR: EA80 Rn | 0 Rd 00 00 Rm
1814                    let hw1: u16 = (0xEA80 | rn_bits) as u16;
1815                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1816
1817                    let mut bytes = hw1.to_le_bytes().to_vec();
1818                    bytes.extend_from_slice(&hw2.to_le_bytes());
1819                    Ok(bytes)
1820                } else {
1821                    let instr: u16 = 0xBF00;
1822                    Ok(instr.to_le_bytes().to_vec())
1823                }
1824            }
1825
1826            // Shift operations (16-bit for low registers)
1827            ArmOp::Lsl { rd, rn, shift } => {
1828                let rd_bits = reg_to_bits(rd) as u16;
1829                let rn_bits = reg_to_bits(rn) as u16;
1830                let shift_bits = (*shift as u16) & 0x1F;
1831
1832                if rd_bits < 8 && rn_bits < 8 {
1833                    // LSLS Rd, Rm, #imm5 (16-bit): 0000 0 imm5 Rm Rd
1834                    let instr: u16 = (shift_bits << 6) | (rn_bits << 3) | rd_bits;
1835                    Ok(instr.to_le_bytes().to_vec())
1836                } else {
1837                    // Use 32-bit encoding for high registers
1838                    self.encode_thumb32_shift(rd, rn, *shift, 0b00) // LSL type
1839                }
1840            }
1841
1842            ArmOp::Lsr { rd, rn, shift } => {
1843                let rd_bits = reg_to_bits(rd) as u16;
1844                let rn_bits = reg_to_bits(rn) as u16;
1845                let shift_bits = (*shift as u16) & 0x1F;
1846
1847                if rd_bits < 8 && rn_bits < 8 && shift_bits > 0 {
1848                    // LSRS Rd, Rm, #imm5 (16-bit): 0000 1 imm5 Rm Rd
1849                    let instr: u16 = 0x0800 | (shift_bits << 6) | (rn_bits << 3) | rd_bits;
1850                    Ok(instr.to_le_bytes().to_vec())
1851                } else {
1852                    self.encode_thumb32_shift(rd, rn, *shift, 0b01) // LSR type
1853                }
1854            }
1855
1856            ArmOp::Asr { rd, rn, shift } => {
1857                let rd_bits = reg_to_bits(rd) as u16;
1858                let rn_bits = reg_to_bits(rn) as u16;
1859                let shift_bits = (*shift as u16) & 0x1F;
1860
1861                if rd_bits < 8 && rn_bits < 8 && shift_bits > 0 {
1862                    // ASRS Rd, Rm, #imm5 (16-bit): 0001 0 imm5 Rm Rd
1863                    let instr: u16 = 0x1000 | (shift_bits << 6) | (rn_bits << 3) | rd_bits;
1864                    Ok(instr.to_le_bytes().to_vec())
1865                } else {
1866                    self.encode_thumb32_shift(rd, rn, *shift, 0b10) // ASR type
1867                }
1868            }
1869
1870            ArmOp::Ror { rd, rn, shift } => {
1871                // ROR doesn't have a 16-bit immediate form, use 32-bit
1872                self.encode_thumb32_shift(rd, rn, *shift, 0b11) // ROR type
1873            }
1874
1875            // Register-based shifts (Thumb-2 32-bit)
1876            // Encoding: 11111010 0xxS Rn 1111 Rd 0000 Rm
1877            // xx = shift type: 00=LSL, 01=LSR, 10=ASR, 11=ROR
1878            ArmOp::LslReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b00),
1879            ArmOp::LsrReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b01),
1880            ArmOp::AsrReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b10),
1881            ArmOp::RorReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b11),
1882
1883            // RSB (Reverse Subtract): Rd = imm - Rn
1884            // Thumb-2 T2 encoding: 11110 i 0 1110 S Rn | 0 imm3 Rd imm8
1885            ArmOp::Rsb { rd, rn, imm } => {
1886                let rd_bits = reg_to_bits(rd);
1887                let rn_bits = reg_to_bits(rn);
1888                let imm_val = *imm;
1889
1890                let i_bit = (imm_val >> 11) & 1;
1891                let imm3 = (imm_val >> 8) & 0x7;
1892                let imm8 = imm_val & 0xFF;
1893
1894                // hw1: 11110 i 01110 0 Rn  (S=0)
1895                let hw1: u16 = (0xF1C0 | (i_bit << 10) | rn_bits) as u16;
1896                // hw2: 0 imm3 Rd imm8
1897                let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
1898
1899                let mut bytes = hw1.to_le_bytes().to_vec();
1900                bytes.extend_from_slice(&hw2.to_le_bytes());
1901                Ok(bytes)
1902            }
1903
1904            // CLZ (Thumb-2 32-bit)
1905            ArmOp::Clz { rd, rm } => {
1906                let rd_bits = reg_to_bits(rd);
1907                let rm_bits = reg_to_bits(rm);
1908
1909                // Thumb-2 CLZ: FAB0 | Rm, then F080 | Rd<<8 | Rm
1910                // 11111010 1011 Rm | 1111 Rd 1000 Rm
1911                let hw1: u16 = (0xFAB0 | rm_bits) as u16;
1912                let hw2: u16 = (0xF080 | (rd_bits << 8) | rm_bits) as u16;
1913
1914                let mut bytes = hw1.to_le_bytes().to_vec();
1915                bytes.extend_from_slice(&hw2.to_le_bytes());
1916                Ok(bytes)
1917            }
1918
1919            // RBIT (Thumb-2 32-bit)
1920            ArmOp::Rbit { rd, rm } => {
1921                let rd_bits = reg_to_bits(rd);
1922                let rm_bits = reg_to_bits(rm);
1923
1924                // Thumb-2 RBIT: FA90 Rm | F0 Rd A0 Rm
1925                // 11111010 1001 Rm | 1111 Rd 1010 Rm
1926                let hw1: u16 = (0xFA90 | rm_bits) as u16;
1927                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rm_bits) as u16;
1928
1929                let mut bytes = hw1.to_le_bytes().to_vec();
1930                bytes.extend_from_slice(&hw2.to_le_bytes());
1931                Ok(bytes)
1932            }
1933
1934            // SXTB (16-bit for low registers)
1935            ArmOp::Sxtb { rd, rm } => {
1936                let rd_bits = reg_to_bits(rd) as u16;
1937                let rm_bits = reg_to_bits(rm) as u16;
1938
1939                if rd_bits < 8 && rm_bits < 8 {
1940                    // SXTB Rd, Rm (16-bit): 1011 0010 01 Rm Rd
1941                    let instr: u16 = 0xB240 | (rm_bits << 3) | rd_bits;
1942                    Ok(instr.to_le_bytes().to_vec())
1943                } else {
1944                    // Thumb-2 SXTB.W: FA4F F(rd)80 (rm)
1945                    // 11111010 0100 1111 | 1111 Rd 10 rotate Rm
1946                    let rd_bits32 = rd_bits as u32;
1947                    let rm_bits32 = rm_bits as u32;
1948                    let hw1: u16 = 0xFA4F;
1949                    let hw2: u16 = (0xF080 | (rd_bits32 << 8) | rm_bits32) as u16;
1950                    let mut bytes = hw1.to_le_bytes().to_vec();
1951                    bytes.extend_from_slice(&hw2.to_le_bytes());
1952                    Ok(bytes)
1953                }
1954            }
1955
1956            // SXTH (16-bit for low registers)
1957            ArmOp::Sxth { rd, rm } => {
1958                let rd_bits = reg_to_bits(rd) as u16;
1959                let rm_bits = reg_to_bits(rm) as u16;
1960
1961                if rd_bits < 8 && rm_bits < 8 {
1962                    // SXTH Rd, Rm (16-bit): 1011 0010 00 Rm Rd
1963                    let instr: u16 = 0xB200 | (rm_bits << 3) | rd_bits;
1964                    Ok(instr.to_le_bytes().to_vec())
1965                } else {
1966                    // Thumb-2 SXTH.W: FA0F F(rd)80 (rm)
1967                    // 11111010 0000 1111 | 1111 Rd 10 rotate Rm
1968                    let rd_bits32 = rd_bits as u32;
1969                    let rm_bits32 = rm_bits as u32;
1970                    let hw1: u16 = 0xFA0F;
1971                    let hw2: u16 = (0xF080 | (rd_bits32 << 8) | rm_bits32) as u16;
1972                    let mut bytes = hw1.to_le_bytes().to_vec();
1973                    bytes.extend_from_slice(&hw2.to_le_bytes());
1974                    Ok(bytes)
1975                }
1976            }
1977
1978            // CMP (can be 16-bit for low registers)
1979            ArmOp::Cmp { rn, op2 } => {
1980                let rn_bits = reg_to_bits(rn) as u16;
1981
1982                if let Operand2::Imm(imm) = op2 {
1983                    // Only use 16-bit encoding for non-negative immediates 0-255
1984                    // Negative immediates must use 32-bit encoding
1985                    if *imm >= 0 && *imm <= 255 && rn_bits < 8 {
1986                        // CMP Rn, #imm8 (16-bit): 0010 1 Rn imm8
1987                        let instr: u16 = 0x2800 | (rn_bits << 8) | (*imm as u16 & 0xFF);
1988                        Ok(instr.to_le_bytes().to_vec())
1989                    } else {
1990                        self.encode_thumb32_cmp_imm(rn, *imm as u32)
1991                    }
1992                } else if let Operand2::Reg(rm) = op2 {
1993                    let rm_bits = reg_to_bits(rm) as u16;
1994                    if rn_bits < 8 && rm_bits < 8 {
1995                        // CMP Rn, Rm (16-bit low): 0100 0010 10 Rm Rn
1996                        let instr: u16 = 0x4280 | (rm_bits << 3) | rn_bits;
1997                        Ok(instr.to_le_bytes().to_vec())
1998                    } else {
1999                        // CMP Rn, Rm (16-bit high): 0100 0101 N Rm Rn[2:0]
2000                        let n_bit = (rn_bits >> 3) & 1;
2001                        let instr: u16 = 0x4500 | (n_bit << 7) | (rm_bits << 3) | (rn_bits & 0x7);
2002                        Ok(instr.to_le_bytes().to_vec())
2003                    }
2004                } else {
2005                    let instr: u16 = 0xBF00;
2006                    Ok(instr.to_le_bytes().to_vec())
2007                }
2008            }
2009
2010            // CMN (Compare Negative) - computes Rn + op2 and sets flags
2011            // CMN Rn, #1 sets Z flag if Rn == -1 (since -1 + 1 = 0)
2012            ArmOp::Cmn { rn, op2 } => {
2013                let rn_bits = reg_to_bits(rn) as u16;
2014
2015                if let Operand2::Imm(imm) = op2 {
2016                    // CMN.W Rn, #imm (32-bit encoding)
2017                    // Encoding: F110 Rn | 0F00 imm8 (for small immediates 0-255)
2018                    if *imm >= 0 && *imm <= 255 {
2019                        let imm8 = *imm as u16 & 0xFF;
2020                        let hw1: u16 = 0xF110 | rn_bits;
2021                        let hw2: u16 = 0x0F00 | imm8;
2022                        let mut bytes = hw1.to_le_bytes().to_vec();
2023                        bytes.extend_from_slice(&hw2.to_le_bytes());
2024                        Ok(bytes)
2025                    } else {
2026                        // For other immediates, fallback to NOP (should not happen in our use case)
2027                        Ok(vec![0xBF, 0x00])
2028                    }
2029                } else if let Operand2::Reg(rm) = op2 {
2030                    let rm_bits = reg_to_bits(rm) as u16;
2031                    // 16-bit CMN (T1) only encodes R0-R7; high registers overflow
2032                    // the 3-bit fields and corrupt the operands (#184, the #180
2033                    // class). CMN has no high-register 16-bit form, so fall back
2034                    // to 32-bit CMN.W (T2): EB10 Rn | 0F00 Rm (ADD.W with S=1 and
2035                    // Rd discarded as PC/1111).
2036                    if rn_bits < 8 && rm_bits < 8 {
2037                        // CMN Rn, Rm (16-bit): 0100 0010 11 Rm Rn
2038                        let instr: u16 = 0x42C0 | (rm_bits << 3) | rn_bits;
2039                        Ok(instr.to_le_bytes().to_vec())
2040                    } else {
2041                        let hw1: u16 = 0xEB10 | rn_bits;
2042                        let hw2: u16 = 0x0F00 | rm_bits;
2043                        let mut bytes = hw1.to_le_bytes().to_vec();
2044                        bytes.extend_from_slice(&hw2.to_le_bytes());
2045                        Ok(bytes)
2046                    }
2047                } else {
2048                    Ok(vec![0xBF, 0x00])
2049                }
2050            }
2051
2052            // LDR (can be 16-bit for simple cases)
2053            ArmOp::Ldr { rd, addr } => {
2054                let rd_bits = reg_to_bits(rd);
2055                let base_bits = reg_to_bits(&addr.base);
2056
2057                // Handle register offset mode [base, Roff] or [base, Roff, #imm]
2058                if let Some(offset_reg) = &addr.offset_reg {
2059                    let rm_bits = reg_to_bits(offset_reg);
2060
2061                    // If there's also an immediate offset, we need to ADD it first
2062                    if addr.offset != 0 {
2063                        // Use R12 (IP) as scratch to avoid clobbering the address register
2064                        // ADD R12, Rm, #offset; LDR Rd, [base, R12]
2065                        let scratch = Reg::R12;
2066                        let mut bytes =
2067                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2068                        bytes.extend(self.encode_thumb32_ldr_reg(rd, &addr.base, &scratch)?);
2069                        return Ok(bytes);
2070                    }
2071
2072                    // Simple register offset: LDR Rd, [Rn, Rm]
2073                    // 16-bit: only if Rd, Rn, Rm < R8
2074                    if rd_bits < 8 && base_bits < 8 && rm_bits < 8 {
2075                        // LDR Rd, [Rn, Rm] (16-bit): 0101 100 Rm Rn Rd
2076                        let instr: u16 = 0x5800
2077                            | ((rm_bits as u16) << 6)
2078                            | ((base_bits as u16) << 3)
2079                            | (rd_bits as u16);
2080                        return Ok(instr.to_le_bytes().to_vec());
2081                    }
2082
2083                    // 32-bit register offset
2084                    return self.encode_thumb32_ldr_reg(rd, &addr.base, offset_reg);
2085                }
2086
2087                // Immediate offset mode [base, #imm]
2088                let offset = addr.offset as u32;
2089
2090                if rd_bits < 8 && base_bits < 8 && (offset & 0x3) == 0 && offset <= 124 {
2091                    // LDR Rd, [Rn, #imm5*4] (16-bit): 0110 1 imm5 Rn Rd
2092                    let imm5 = (offset >> 2) as u16;
2093                    let instr: u16 =
2094                        0x6800 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2095                    Ok(instr.to_le_bytes().to_vec())
2096                } else {
2097                    self.encode_thumb32_ldr(rd, &addr.base, offset)
2098                }
2099            }
2100
2101            // STR (can be 16-bit for simple cases)
2102            ArmOp::Str { rd, addr } => {
2103                let rd_bits = reg_to_bits(rd);
2104                let base_bits = reg_to_bits(&addr.base);
2105
2106                // Handle register offset mode [base, Roff] or [base, Roff, #imm]
2107                if let Some(offset_reg) = &addr.offset_reg {
2108                    let rm_bits = reg_to_bits(offset_reg);
2109
2110                    // If there's also an immediate offset, we need to ADD it first
2111                    if addr.offset != 0 {
2112                        // Use R12 (IP) as scratch to avoid clobbering the address register
2113                        // ADD R12, Rm, #offset; STR Rd, [base, R12]
2114                        let scratch = Reg::R12;
2115                        let mut bytes =
2116                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2117                        bytes.extend(self.encode_thumb32_str_reg(rd, &addr.base, &scratch)?);
2118                        return Ok(bytes);
2119                    }
2120
2121                    // Simple register offset: STR Rd, [Rn, Rm]
2122                    // 16-bit: only if Rd, Rn, Rm < R8
2123                    if rd_bits < 8 && base_bits < 8 && rm_bits < 8 {
2124                        // STR Rd, [Rn, Rm] (16-bit): 0101 000 Rm Rn Rd
2125                        let instr: u16 = 0x5000
2126                            | ((rm_bits as u16) << 6)
2127                            | ((base_bits as u16) << 3)
2128                            | (rd_bits as u16);
2129                        return Ok(instr.to_le_bytes().to_vec());
2130                    }
2131
2132                    // 32-bit register offset
2133                    return self.encode_thumb32_str_reg(rd, &addr.base, offset_reg);
2134                }
2135
2136                // Immediate offset mode [base, #imm]
2137                let offset = addr.offset as u32;
2138
2139                if rd_bits < 8 && base_bits < 8 && (offset & 0x3) == 0 && offset <= 124 {
2140                    // STR Rd, [Rn, #imm5*4] (16-bit): 0110 0 imm5 Rn Rd
2141                    let imm5 = (offset >> 2) as u16;
2142                    let instr: u16 =
2143                        0x6000 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2144                    Ok(instr.to_le_bytes().to_vec())
2145                } else {
2146                    self.encode_thumb32_str(rd, &addr.base, offset)
2147                }
2148            }
2149
2150            // LDRB (Thumb-2)
2151            ArmOp::Ldrb { rd, addr } => {
2152                let rd_bits = reg_to_bits(rd);
2153                let base_bits = reg_to_bits(&addr.base);
2154
2155                if let Some(offset_reg) = &addr.offset_reg {
2156                    if addr.offset != 0 {
2157                        let scratch = Reg::R12;
2158                        let mut bytes =
2159                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2160                        bytes.extend(self.encode_thumb32_ldrb_reg(rd, &addr.base, &scratch)?);
2161                        return Ok(bytes);
2162                    }
2163                    return self.encode_thumb32_ldrb_reg(rd, &addr.base, offset_reg);
2164                }
2165
2166                let offset = addr.offset as u32;
2167                if rd_bits < 8 && base_bits < 8 && offset <= 31 {
2168                    // LDRB Rd, [Rn, #imm5] (16-bit): 0111 1 imm5 Rn Rd
2169                    let instr: u16 = 0x7800
2170                        | ((offset as u16) << 6)
2171                        | ((base_bits as u16) << 3)
2172                        | (rd_bits as u16);
2173                    Ok(instr.to_le_bytes().to_vec())
2174                } else {
2175                    self.encode_thumb32_ldrb_imm(rd, &addr.base, offset)
2176                }
2177            }
2178
2179            // LDRSB (Thumb-2)
2180            ArmOp::Ldrsb { rd, addr } => {
2181                let rd_bits = reg_to_bits(rd);
2182                let base_bits = reg_to_bits(&addr.base);
2183
2184                if let Some(offset_reg) = &addr.offset_reg {
2185                    if addr.offset != 0 {
2186                        let scratch = Reg::R12;
2187                        let mut bytes =
2188                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2189                        bytes.extend(self.encode_thumb32_ldrsb_reg(rd, &addr.base, &scratch)?);
2190                        return Ok(bytes);
2191                    }
2192                    return self.encode_thumb32_ldrsb_reg(rd, &addr.base, offset_reg);
2193                }
2194
2195                let offset = addr.offset as u32;
2196                // LDRSB has no 16-bit immediate form (only register)
2197                // For 16-bit reg form: only if Rd, Rn, Rm < R8
2198                if rd_bits < 8 && base_bits < 8 && offset == 0 {
2199                    // No immediate 16-bit encoding for LDRSB; use 32-bit
2200                    self.encode_thumb32_ldrsb_imm(rd, &addr.base, offset)
2201                } else {
2202                    self.encode_thumb32_ldrsb_imm(rd, &addr.base, offset)
2203                }
2204            }
2205
2206            // LDRH (Thumb-2)
2207            ArmOp::Ldrh { rd, addr } => {
2208                let rd_bits = reg_to_bits(rd);
2209                let base_bits = reg_to_bits(&addr.base);
2210
2211                if let Some(offset_reg) = &addr.offset_reg {
2212                    if addr.offset != 0 {
2213                        let scratch = Reg::R12;
2214                        let mut bytes =
2215                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2216                        bytes.extend(self.encode_thumb32_ldrh_reg(rd, &addr.base, &scratch)?);
2217                        return Ok(bytes);
2218                    }
2219                    return self.encode_thumb32_ldrh_reg(rd, &addr.base, offset_reg);
2220                }
2221
2222                let offset = addr.offset as u32;
2223                if rd_bits < 8 && base_bits < 8 && (offset & 0x1) == 0 && offset <= 62 {
2224                    // LDRH Rd, [Rn, #imm5*2] (16-bit): 1000 1 imm5 Rn Rd
2225                    let imm5 = (offset >> 1) as u16;
2226                    let instr: u16 =
2227                        0x8800 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2228                    Ok(instr.to_le_bytes().to_vec())
2229                } else {
2230                    self.encode_thumb32_ldrh_imm(rd, &addr.base, offset)
2231                }
2232            }
2233
2234            // LDRSH (Thumb-2)
2235            ArmOp::Ldrsh { rd, addr } => {
2236                if let Some(offset_reg) = &addr.offset_reg {
2237                    if addr.offset != 0 {
2238                        let scratch = Reg::R12;
2239                        let mut bytes =
2240                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2241                        bytes.extend(self.encode_thumb32_ldrsh_reg(rd, &addr.base, &scratch)?);
2242                        return Ok(bytes);
2243                    }
2244                    return self.encode_thumb32_ldrsh_reg(rd, &addr.base, offset_reg);
2245                }
2246
2247                let offset = addr.offset as u32;
2248                self.encode_thumb32_ldrsh_imm(rd, &addr.base, offset)
2249            }
2250
2251            // STRB (Thumb-2)
2252            ArmOp::Strb { rd, addr } => {
2253                let rd_bits = reg_to_bits(rd);
2254                let base_bits = reg_to_bits(&addr.base);
2255
2256                if let Some(offset_reg) = &addr.offset_reg {
2257                    if addr.offset != 0 {
2258                        let scratch = Reg::R12;
2259                        let mut bytes =
2260                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2261                        bytes.extend(self.encode_thumb32_strb_reg(rd, &addr.base, &scratch)?);
2262                        return Ok(bytes);
2263                    }
2264                    return self.encode_thumb32_strb_reg(rd, &addr.base, offset_reg);
2265                }
2266
2267                let offset = addr.offset as u32;
2268                if rd_bits < 8 && base_bits < 8 && offset <= 31 {
2269                    // STRB Rd, [Rn, #imm5] (16-bit): 0111 0 imm5 Rn Rd
2270                    let instr: u16 = 0x7000
2271                        | ((offset as u16) << 6)
2272                        | ((base_bits as u16) << 3)
2273                        | (rd_bits as u16);
2274                    Ok(instr.to_le_bytes().to_vec())
2275                } else {
2276                    self.encode_thumb32_strb_imm(rd, &addr.base, offset)
2277                }
2278            }
2279
2280            // STRH (Thumb-2)
2281            ArmOp::Strh { rd, addr } => {
2282                let rd_bits = reg_to_bits(rd);
2283                let base_bits = reg_to_bits(&addr.base);
2284
2285                if let Some(offset_reg) = &addr.offset_reg {
2286                    if addr.offset != 0 {
2287                        let scratch = Reg::R12;
2288                        let mut bytes =
2289                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2290                        bytes.extend(self.encode_thumb32_strh_reg(rd, &addr.base, &scratch)?);
2291                        return Ok(bytes);
2292                    }
2293                    return self.encode_thumb32_strh_reg(rd, &addr.base, offset_reg);
2294                }
2295
2296                let offset = addr.offset as u32;
2297                if rd_bits < 8 && base_bits < 8 && (offset & 0x1) == 0 && offset <= 62 {
2298                    // STRH Rd, [Rn, #imm5*2] (16-bit): 1000 0 imm5 Rn Rd
2299                    let imm5 = (offset >> 1) as u16;
2300                    let instr: u16 =
2301                        0x8000 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2302                    Ok(instr.to_le_bytes().to_vec())
2303                } else {
2304                    self.encode_thumb32_strh_imm(rd, &addr.base, offset)
2305                }
2306            }
2307
2308            // MemorySize (Thumb-2)
2309            ArmOp::MemorySize { rd } => {
2310                // LSR rd, R10, #16 — memory size in bytes / 65536 = pages
2311                // Thumb-2 16-bit: LSRS Rd, Rm, #imm5 — 0000 1 imm5 Rm Rd
2312                let rd_bits = reg_to_bits(rd);
2313                let r10_bits = reg_to_bits(&Reg::R10);
2314                if rd_bits < 8 && r10_bits < 8 {
2315                    let instr: u16 =
2316                        0x0800 | (16u16 << 6) | ((r10_bits as u16) << 3) | (rd_bits as u16);
2317                    Ok(instr.to_le_bytes().to_vec())
2318                } else {
2319                    // Thumb-2 32-bit LSR: 1110 1010 010 0 1111 | 0 imm3 Rd imm2 01 Rm
2320                    let imm5: u32 = 16;
2321                    let imm3 = (imm5 >> 2) & 0x7;
2322                    let imm2 = imm5 & 0x3;
2323                    let hw1: u16 = 0xEA4F;
2324                    let hw2: u16 =
2325                        ((imm3 << 12) | (rd_bits << 8) | (imm2 << 6) | 0x10 | r10_bits) as u16;
2326                    let mut bytes = hw1.to_le_bytes().to_vec();
2327                    bytes.extend_from_slice(&hw2.to_le_bytes());
2328                    Ok(bytes)
2329                }
2330            }
2331
2332            // MemoryGrow (Thumb-2)
2333            ArmOp::MemoryGrow { rd, .. } => {
2334                // On embedded with fixed memory, always return -1 (failure)
2335                // MVN rd, #0 → MOV rd, #-1
2336                // Thumb-2 32-bit: MVN: 1111 0 i 0 0 0 1 1 0 1111 | 0 imm3 Rd imm8
2337                let rd_bits = reg_to_bits(rd);
2338                let hw1: u16 = 0xF06F; // MVN with i=0
2339                let hw2: u16 = (rd_bits << 8) as u16; // imm8=0 → ~0 = 0xFFFFFFFF = -1
2340                let mut bytes = hw1.to_le_bytes().to_vec();
2341                bytes.extend_from_slice(&hw2.to_le_bytes());
2342                Ok(bytes)
2343            }
2344
2345            // BX (16-bit)
2346            ArmOp::Bx { rm } => {
2347                let rm_bits = reg_to_bits(rm) as u16;
2348                // BX Rm (16-bit): 0100 0111 0 Rm 000
2349                let instr: u16 = 0x4700 | (rm_bits << 3);
2350                Ok(instr.to_le_bytes().to_vec())
2351            }
2352
2353            // BLX (16-bit) - Branch with Link and Exchange
2354            // BLX Rm: 0100 0111 1 Rm 000
2355            ArmOp::Blx { rm } => {
2356                let rm_bits = reg_to_bits(rm) as u16;
2357                let instr: u16 = 0x4780 | (rm_bits << 3);
2358                Ok(instr.to_le_bytes().to_vec())
2359            }
2360
2361            // CallIndirect - indirect function call via table lookup
2362            // table_index_reg contains the table index
2363            // Generates: LSL R12, idx, #2; LDR R12, [R12, table_base]; BLX R12
2364            ArmOp::CallIndirect {
2365                rd: _,
2366                type_idx: _,
2367                table_index_reg,
2368            } => {
2369                let idx_reg = reg_to_bits(table_index_reg);
2370                let mut bytes = Vec::new();
2371
2372                // For now, we generate code that:
2373                // 1. Multiplies index by 4 (function pointer size)
2374                // 2. Loads function pointer from table (assumes table base in R11)
2375                // 3. Calls the function via BLX
2376                //
2377                // Table base setup must be done by caller/runtime.
2378                // This is a simplified implementation - full support needs:
2379                // - Table base address resolution
2380                // - Type signature checking
2381                // - Bounds checking
2382
2383                // LSL R12, idx_reg, #2 (multiply index by 4)
2384                // Thumb-2 MOV with shift: 11101010 010 S 1111 | 0 imm3 Rd imm2 type Rm
2385                // LSL: type=00, imm5=2 -> imm3=0, imm2=10
2386                let hw1: u16 = 0xEA4F_u16; // MOV.W R12, Rm, LSL #2
2387                let hw2: u16 = ((0x0C00 | (0b10 << 4)) | idx_reg) as u16;
2388                bytes.extend_from_slice(&hw1.to_le_bytes());
2389                bytes.extend_from_slice(&hw2.to_le_bytes());
2390
2391                // LDR R12, [R11, R12] - load function pointer
2392                // Thumb-2 LDR (register): 1111 1000 0101 Rn | Rt 0000 00 imm2 Rm
2393                // Rn=R11, Rt=R12, Rm=R12, imm2=00 (no shift)
2394                let ldr_hw1: u16 = 0xF85B; // LDR.W Rt, [R11, Rm]
2395                let ldr_hw2: u16 = 0xC00C; // Rt=R12, imm2=00, Rm=R12
2396                bytes.extend_from_slice(&ldr_hw1.to_le_bytes());
2397                bytes.extend_from_slice(&ldr_hw2.to_le_bytes());
2398
2399                // BLX R12 (call function indirectly)
2400                // BLX Rm (16-bit): 0100 0111 1 Rm 000
2401                let blx: u16 = 0x47E0; // BLX R12
2402                bytes.extend_from_slice(&blx.to_le_bytes());
2403
2404                Ok(bytes)
2405            }
2406
2407            // Label pseudo-instruction: emits no machine code
2408            ArmOp::Label { .. } => Ok(Vec::new()),
2409
2410            // Conditional branch to label (generic) - offset 0, will be patched
2411            ArmOp::Bcc { cond, label: _ } => {
2412                use synth_synthesis::Condition;
2413                let cond_bits: u16 = match cond {
2414                    Condition::EQ => 0x0,
2415                    Condition::NE => 0x1,
2416                    Condition::HS => 0x2,
2417                    Condition::LO => 0x3,
2418                    Condition::HI => 0x8,
2419                    Condition::LS => 0x9,
2420                    Condition::GE => 0xA,
2421                    Condition::LT => 0xB,
2422                    Condition::GT => 0xC,
2423                    Condition::LE => 0xD,
2424                };
2425                // 16-bit B<cond> with offset 0: 1101 cond imm8
2426                let instr: u16 = 0xD000 | (cond_bits << 8);
2427                Ok(instr.to_le_bytes().to_vec())
2428            }
2429
2430            // Branch instructions
2431            ArmOp::B { label: _ } => {
2432                // Simplified: B.N with offset 0
2433                // For real usage, would need label resolution
2434                let instr: u16 = 0xE000; // B.N #0
2435                Ok(instr.to_le_bytes().to_vec())
2436            }
2437
2438            // BHS (Branch if Higher or Same) - used for bounds checking
2439            // Condition code: 0x2 (C set)
2440            ArmOp::Bhs { label: _ } => {
2441                // 16-bit B<cond> with offset 0: 1101 cond imm8
2442                // cond = 0x2 (HS)
2443                let instr: u16 = 0xD200; // BHS.N #0
2444                Ok(instr.to_le_bytes().to_vec())
2445            }
2446
2447            // BLO (Branch if Lower) - complementary to BHS
2448            // Condition code: 0x3 (C clear)
2449            ArmOp::Blo { label: _ } => {
2450                // 16-bit B<cond> with offset 0: 1101 cond imm8
2451                // cond = 0x3 (LO)
2452                let instr: u16 = 0xD300; // BLO.N #0
2453                Ok(instr.to_le_bytes().to_vec())
2454            }
2455
2456            // Branch with numeric offset (Thumb-2)
2457            // Thumb-2 B.W instruction: 32-bit with +-16MB range
2458            ArmOp::BOffset { offset } => {
2459                // offset is already the halfword displacement: (target - branch - 4) / 2
2460                // This is the raw encoded value, accounting for variable-length instructions
2461                let halfword_offset = *offset;
2462
2463                // 16-bit B.N encoding: 1110 0 imm11 (11-bit signed halfword offset)
2464                // Range: -1024 to +1022 halfwords
2465                if (-1024..=1022).contains(&halfword_offset) {
2466                    // 16-bit B.N encoding: 1110 0 imm11
2467                    let imm11 = (halfword_offset as u16) & 0x7FF;
2468                    let instr: u16 = 0xE000 | imm11;
2469                    Ok(instr.to_le_bytes().to_vec())
2470                } else {
2471                    // 32-bit B.W encoding for larger offsets
2472                    // First halfword: 1111 0 S imm10
2473                    // Second halfword: 10 J1 0 J2 imm11
2474                    // Total offset = SignExtend(S:I1:I2:imm10:imm11:0)
2475                    // where I1 = NOT(J1 XOR S), I2 = NOT(J2 XOR S)
2476
2477                    // The B.W (T4) encoding packs the signed offset as:
2478                    //   S:I1:I2:imm10:imm11:0  (25-bit signed, halfword-aligned)
2479                    // where J1 = NOT(I1 XOR S), J2 = NOT(I2 XOR S)
2480                    // Input halfword_offset already equals (target - PC - 4) / 2,
2481                    // so the full byte offset = halfword_offset << 1.
2482                    // The encoding fields split that 25-bit signed value (including the
2483                    // implicit trailing zero) as: S | imm10 | imm11
2484                    // with I1 = bit 23 and I2 = bit 22 of the signed offset.
2485                    let signed_offset = halfword_offset << 1; // byte offset
2486                    let s = if signed_offset < 0 { 1u32 } else { 0u32 };
2487                    let uoffset = signed_offset as u32;
2488                    let imm10 = (uoffset >> 12) & 0x3FF; // bits [21:12]
2489                    let imm11 = (uoffset >> 1) & 0x7FF; // bits [11:1]
2490                    let i1 = (uoffset >> 23) & 1; // bit 23
2491                    let i2 = (uoffset >> 22) & 1; // bit 22
2492                    let j1 = (!(i1 ^ s)) & 1; // J1 = NOT(I1 XOR S)
2493                    let j2 = (!(i2 ^ s)) & 1; // J2 = NOT(I2 XOR S)
2494
2495                    let hw1: u16 = (0xF000 | (s << 10) | imm10) as u16;
2496                    let hw2: u16 = (0x9000 | (j1 << 13) | (j2 << 11) | imm11) as u16;
2497
2498                    let mut bytes = hw1.to_le_bytes().to_vec();
2499                    bytes.extend_from_slice(&hw2.to_le_bytes());
2500                    Ok(bytes)
2501                }
2502            }
2503
2504            // Conditional branch with numeric offset (Thumb-2)
2505            ArmOp::BCondOffset { cond, offset } => {
2506                use synth_synthesis::Condition;
2507                let cond_bits: u16 = match cond {
2508                    Condition::EQ => 0x0,
2509                    Condition::NE => 0x1,
2510                    Condition::HS => 0x2,
2511                    Condition::LO => 0x3,
2512                    Condition::HI => 0x8,
2513                    Condition::LS => 0x9,
2514                    Condition::GE => 0xA,
2515                    Condition::LT => 0xB,
2516                    Condition::GT => 0xC,
2517                    Condition::LE => 0xD,
2518                };
2519
2520                // offset is already the halfword displacement: (target - branch - 4) / 2
2521                // This is the raw imm8 value for 16-bit B<cond> encoding
2522                let halfword_offset = *offset;
2523
2524                // 16-bit B<cond> encoding: 1101 cond imm8
2525                // Range: -256 to +254 halfwords (imm8 is sign-extended and shifted left 1)
2526                if (-128..=127).contains(&halfword_offset) {
2527                    let imm8 = (halfword_offset as u16) & 0xFF;
2528                    let instr: u16 = 0xD000 | (cond_bits << 8) | imm8;
2529                    Ok(instr.to_le_bytes().to_vec())
2530                } else {
2531                    // 32-bit B<cond>.W for larger offsets
2532                    // First halfword: 1111 0 S cond imm6
2533                    // Second halfword: 10 J1 0 J2 imm11
2534                    let offset = halfword_offset >> 1;
2535                    let s = if offset < 0 { 1u32 } else { 0u32 };
2536                    let imm6 = ((offset >> 11) as u32) & 0x3F;
2537                    let imm11 = (offset as u32) & 0x7FF;
2538                    let j1 = if s == 1 { 1 } else { 0 };
2539                    let j2 = if s == 1 { 1 } else { 0 };
2540
2541                    let hw1: u16 = (0xF000 | (s << 10) | ((cond_bits as u32) << 6) | imm6) as u16;
2542                    let hw2: u16 = (0x8000 | (j1 << 13) | (j2 << 11) | imm11) as u16;
2543
2544                    let mut bytes = hw1.to_le_bytes().to_vec();
2545                    bytes.extend_from_slice(&hw2.to_le_bytes());
2546                    Ok(bytes)
2547                }
2548            }
2549
2550            ArmOp::Bl { label: _ } => {
2551                // BL is always 32-bit in Thumb-2, encoded here as a relocatable
2552                // placeholder; an R_ARM_THM_CALL relocation patches the target
2553                // (see arm_backend.rs). The placeholder must carry an embedded
2554                // addend of -4 so the relocation nets to exactly the symbol S.
2555                //
2556                // Thumb BL computes `target = (P + 4) + signed_offset`. Under
2557                // R_ARM_THM_CALL the linker resolves using the in-place addend;
2558                // a 0xF800 placeholder (addend 0) lands at S+4 — every call one
2559                // instruction past the callee entry (#174). The correct
2560                // placeholder is what `gas` emits for `bl <extern>`:
2561                //   f7ff fffe  ->  `bl <self>`  (S=1, J1=J2=1, imm = -4 addend),
2562                // i.e. hw1=0xF7FF, hw2=0xFFFE. This nets to S, not S+4.
2563                // (The earlier 0xD000 was worse still — a ~+0x600000 addend,
2564                // the garbage `bl c0000c` and "truncated to fit" of #167.)
2565                let hw1: u16 = 0xF7FF;
2566                let hw2: u16 = 0xFFFE;
2567                let mut bytes = hw1.to_le_bytes().to_vec();
2568                bytes.extend_from_slice(&hw2.to_le_bytes());
2569                Ok(bytes)
2570            }
2571
2572            // MVN
2573            ArmOp::Mvn { rd, op2 } => {
2574                if let Operand2::Reg(rm) = op2 {
2575                    let rd_bits = reg_to_bits(rd) as u16;
2576                    let rm_bits = reg_to_bits(rm) as u16;
2577
2578                    if rd_bits < 8 && rm_bits < 8 {
2579                        // MVNS Rd, Rm (16-bit): 0100 0011 11 Rm Rd
2580                        let instr: u16 = 0x43C0 | (rm_bits << 3) | rd_bits;
2581                        Ok(instr.to_le_bytes().to_vec())
2582                    } else {
2583                        // 32-bit MVN
2584                        let hw1: u16 = 0xEA6F_u16;
2585                        let hw2: u16 = ((reg_to_bits(rd) << 8) | reg_to_bits(rm)) as u16;
2586                        let mut bytes = hw1.to_le_bytes().to_vec();
2587                        bytes.extend_from_slice(&hw2.to_le_bytes());
2588                        Ok(bytes)
2589                    }
2590                } else {
2591                    let instr: u16 = 0xBF00;
2592                    Ok(instr.to_le_bytes().to_vec())
2593                }
2594            }
2595
2596            // MOVW - Move Wide (Thumb-2 32-bit)
2597            ArmOp::Movw { rd, imm16 } => {
2598                self.encode_thumb32_movw_raw(reg_to_bits(rd), *imm16 as u32)
2599            }
2600
2601            // MOVT - Move Top (Thumb-2 32-bit)
2602            ArmOp::Movt { rd, imm16 } => {
2603                self.encode_thumb32_movt_raw(reg_to_bits(rd), *imm16 as u32)
2604            }
2605
2606            // SetCond: Materialize condition flag into register (0 or 1)
2607            // Strategy: ITE <cond>; MOV Rd, #1; MOV Rd, #0
2608            // IMPORTANT: Must use ITE (If-Then-Else) because 16-bit Thumb MOV
2609            // always sets flags (MOVS). We need to evaluate the condition BEFORE
2610            // any MOV instruction clobbers the flags from CMP.
2611            ArmOp::SetCond { rd, cond } => {
2612                let rd_bits = reg_to_bits(rd) as u16;
2613
2614                // Condition code encoding for IT block
2615                use synth_synthesis::Condition;
2616                let cond_bits: u16 = match cond {
2617                    Condition::EQ => 0x0,
2618                    Condition::NE => 0x1,
2619                    Condition::LT => 0xB,
2620                    Condition::LE => 0xD,
2621                    Condition::GT => 0xC,
2622                    Condition::GE => 0xA,
2623                    Condition::LO => 0x3, // CC/LO (unsigned <)
2624                    Condition::LS => 0x9, // LS (unsigned <=)
2625                    Condition::HI => 0x8, // HI (unsigned >)
2626                    Condition::HS => 0x2, // CS/HS (unsigned >=)
2627                };
2628
2629                // ITE <cond>: encodes If-Then-Else block
2630                // The mask field depends on firstcond[0]:
2631                // - If firstcond[0] = 0: mask = 0xC for TE pattern (ITE EQ = BF0C)
2632                // - If firstcond[0] = 1: mask = 0x4 for TE pattern (ITE NE = BF14)
2633                let mask = if (cond_bits & 1) == 0 { 0xC } else { 0x4 };
2634                let ite_instr: u16 = 0xBF00 | (cond_bits << 4) | mask;
2635
2636                // Materialize 0/1 into Rd. The 16-bit MOVS (T1) encodes Rd in a
2637                // 3-bit field (bits[10:8]) — only R0–R7. For a high register
2638                // (R8–R12) `rd_bits << 8` overflows into bit 11 and silently
2639                // turns MOVS into CMP (00100 → 00101), corrupting the result
2640                // (this mis-materialized gale's `has_waiter`, so its `local.set`
2641                // stored a stale register → the binary-sem WAKE dispatch read
2642                // garbage). Use the 32-bit MOV.W (T2) for high registers, which
2643                // has a 4-bit Rd field. MOV.W with S=0 doesn't set flags, which
2644                // is fine inside the ITE (the materialized value is the result;
2645                // the flags are not consumed afterwards).
2646                let mut bytes = ite_instr.to_le_bytes().to_vec();
2647                let push_mov = |bytes: &mut Vec<u8>, imm: u16| {
2648                    if rd_bits <= 7 {
2649                        let m: u16 = 0x2000 | (rd_bits << 8) | imm; // 16-bit MOVS Rd,#imm
2650                        bytes.extend_from_slice(&m.to_le_bytes());
2651                    } else {
2652                        // 32-bit MOV.W Rd, #imm (T2): F04F | (Rd<<8) | imm8
2653                        let hw1: u16 = 0xF04F;
2654                        let hw2: u16 = (rd_bits << 8) | imm;
2655                        bytes.extend_from_slice(&hw1.to_le_bytes());
2656                        bytes.extend_from_slice(&hw2.to_le_bytes());
2657                    }
2658                };
2659                push_mov(&mut bytes, 1); // Then branch (condition true)  → 1
2660                push_mov(&mut bytes, 0); // Else branch (condition false) → 0
2661                Ok(bytes)
2662            }
2663
2664            // I64SetCond: Compare two i64 register pairs, result 0/1 in rd
2665            // EQ/NE: CMP lo,lo; IT EQ; CMPEQ hi,hi; ITE <cond>; MOV 1; MOV 0
2666            // LT: CMP lo,lo; SBCS rd,hi,hi; ITE LT; MOV 1; MOV 0
2667            // GT: CMP lo,lo (swapped); SBCS rd,hi,hi (swapped); ITE LT; MOV 1; MOV 0
2668            ArmOp::I64SetCond {
2669                rd,
2670                rn_lo,
2671                rn_hi,
2672                rm_lo,
2673                rm_hi,
2674                cond,
2675            } => {
2676                use synth_synthesis::Condition;
2677                let rd_bits = reg_to_bits(rd) as u16;
2678                let mut bytes = Vec::new();
2679
2680                // Helper: encode CMP Rn, Rm (16-bit)
2681                let encode_cmp_reg = |rn: &synth_synthesis::Reg,
2682                                      rm: &synth_synthesis::Reg|
2683                 -> Vec<u8> {
2684                    let rn_bits = reg_to_bits(rn) as u16;
2685                    let rm_bits = reg_to_bits(rm) as u16;
2686                    if rn_bits < 8 && rm_bits < 8 {
2687                        let instr: u16 = 0x4280 | (rm_bits << 3) | rn_bits;
2688                        instr.to_le_bytes().to_vec()
2689                    } else {
2690                        let n_bit = (rn_bits >> 3) & 1;
2691                        let instr: u16 = 0x4500 | (n_bit << 7) | (rm_bits << 3) | (rn_bits & 0x7);
2692                        instr.to_le_bytes().to_vec()
2693                    }
2694                };
2695
2696                // Helper: encode ITE <cond> (2 bytes)
2697                let encode_ite = |cond_bits: u16| -> Vec<u8> {
2698                    let mask = if (cond_bits & 1) == 0 { 0xC } else { 0x4 };
2699                    let ite_instr: u16 = 0xBF00 | (cond_bits << 4) | mask;
2700                    ite_instr.to_le_bytes().to_vec()
2701                };
2702
2703                // Helper: encode SetCond (ITE + MOV #1 + MOV #0) for given condition
2704                let encode_setcond = |cond_bits: u16, rd_bits: u16| -> Vec<u8> {
2705                    let mut b = encode_ite(cond_bits);
2706                    let mov_one: u16 = 0x2001 | (rd_bits << 8);
2707                    let mov_zero: u16 = 0x2000 | (rd_bits << 8);
2708                    b.extend_from_slice(&mov_one.to_le_bytes());
2709                    b.extend_from_slice(&mov_zero.to_le_bytes());
2710                    b
2711                };
2712
2713                match cond {
2714                    Condition::EQ | Condition::NE => {
2715                        // CMP rn_lo, rm_lo (compare low words)
2716                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2717
2718                        // IT EQ (execute next instruction only if Z=1)
2719                        let it_eq: u16 = 0xBF08; // IT EQ: cond=0000, mask=1000
2720                        bytes.extend_from_slice(&it_eq.to_le_bytes());
2721
2722                        // CMPEQ rn_hi, rm_hi (compare high words, only if low equal)
2723                        bytes.extend_from_slice(&encode_cmp_reg(rn_hi, rm_hi));
2724
2725                        // ITE <cond>; MOV rd, #1; MOV rd, #0
2726                        let cond_bits: u16 = match cond {
2727                            Condition::EQ => 0x0,
2728                            Condition::NE => 0x1,
2729                            _ => unreachable!(),
2730                        };
2731                        bytes.extend_from_slice(&encode_setcond(cond_bits, rd_bits));
2732                    }
2733
2734                    Condition::LT => {
2735                        // CMP rn_lo, rm_lo (sets C flag for borrow)
2736                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2737
2738                        // SBCS rd, rn_hi, rm_hi (subtract with carry, sets N,V flags)
2739                        // SBCS.W Rd, Rn, Rm: EB70 Rn | 0000 Rd 0000 Rm
2740                        let rn_hi_bits = reg_to_bits(rn_hi);
2741                        let rm_hi_bits = reg_to_bits(rm_hi);
2742                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2743                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2744                        bytes.extend_from_slice(&hw1.to_le_bytes());
2745                        bytes.extend_from_slice(&hw2.to_le_bytes());
2746
2747                        // ITE LT; MOV rd, #1; MOV rd, #0
2748                        bytes.extend_from_slice(&encode_setcond(0xB, rd_bits)); // LT = 0xB
2749                    }
2750
2751                    Condition::GT => {
2752                        // GT(a,b) = LT(b,a): swap operands
2753                        // CMP rm_lo, rn_lo (swapped)
2754                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2755
2756                        // SBCS rd, rm_hi, rn_hi (swapped)
2757                        let rm_hi_bits = reg_to_bits(rm_hi);
2758                        let rn_hi_bits = reg_to_bits(rn_hi);
2759                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2760                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2761                        bytes.extend_from_slice(&hw1.to_le_bytes());
2762                        bytes.extend_from_slice(&hw2.to_le_bytes());
2763
2764                        // ITE LT; MOV rd, #1; MOV rd, #0
2765                        bytes.extend_from_slice(&encode_setcond(0xB, rd_bits)); // LT = 0xB
2766                    }
2767
2768                    Condition::LE => {
2769                        // LE(a,b) = !GT(a,b): use GT logic but invert result
2770                        // GT(a,b) = LT(b,a): so we do CMP(b,a) and check LT, then invert
2771                        // CMP rm_lo, rn_lo (swapped, same as GT)
2772                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2773
2774                        // SBCS rd, rm_hi, rn_hi (swapped)
2775                        let rm_hi_bits = reg_to_bits(rm_hi);
2776                        let rn_hi_bits = reg_to_bits(rn_hi);
2777                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2778                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2779                        bytes.extend_from_slice(&hw1.to_le_bytes());
2780                        bytes.extend_from_slice(&hw2.to_le_bytes());
2781
2782                        // ITE GE; MOV rd, #1; MOV rd, #0 (GE is !LT, so inverting GT result)
2783                        bytes.extend_from_slice(&encode_setcond(0xA, rd_bits)); // GE = 0xA
2784                    }
2785
2786                    Condition::GE => {
2787                        // GE(a,b) = !LT(a,b): use LT logic but invert result
2788                        // CMP rn_lo, rm_lo (same as LT)
2789                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2790
2791                        // SBCS rd, rn_hi, rm_hi (same as LT)
2792                        let rn_hi_bits = reg_to_bits(rn_hi);
2793                        let rm_hi_bits = reg_to_bits(rm_hi);
2794                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2795                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2796                        bytes.extend_from_slice(&hw1.to_le_bytes());
2797                        bytes.extend_from_slice(&hw2.to_le_bytes());
2798
2799                        // ITE GE; MOV rd, #1; MOV rd, #0 (GE is !LT)
2800                        bytes.extend_from_slice(&encode_setcond(0xA, rd_bits)); // GE = 0xA
2801                    }
2802
2803                    // Unsigned comparisons - same instruction sequence, different conditions
2804                    Condition::LO => {
2805                        // LO (unsigned LT): CMP lo, SBCS hi, check C=0
2806                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2807                        let rn_hi_bits = reg_to_bits(rn_hi);
2808                        let rm_hi_bits = reg_to_bits(rm_hi);
2809                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2810                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2811                        bytes.extend_from_slice(&hw1.to_le_bytes());
2812                        bytes.extend_from_slice(&hw2.to_le_bytes());
2813                        bytes.extend_from_slice(&encode_setcond(0x3, rd_bits)); // LO = 0x3 (CC)
2814                    }
2815
2816                    Condition::HI => {
2817                        // HI (unsigned GT): swap operands and check LO
2818                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2819                        let rm_hi_bits = reg_to_bits(rm_hi);
2820                        let rn_hi_bits = reg_to_bits(rn_hi);
2821                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2822                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2823                        bytes.extend_from_slice(&hw1.to_le_bytes());
2824                        bytes.extend_from_slice(&hw2.to_le_bytes());
2825                        bytes.extend_from_slice(&encode_setcond(0x3, rd_bits)); // LO = 0x3 (CC)
2826                    }
2827
2828                    Condition::LS => {
2829                        // LS (unsigned LE): !(a > b) = !(HI), so do HI and invert
2830                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2831                        let rm_hi_bits = reg_to_bits(rm_hi);
2832                        let rn_hi_bits = reg_to_bits(rn_hi);
2833                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2834                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2835                        bytes.extend_from_slice(&hw1.to_le_bytes());
2836                        bytes.extend_from_slice(&hw2.to_le_bytes());
2837                        bytes.extend_from_slice(&encode_setcond(0x2, rd_bits)); // HS = 0x2 (CS) = !LO
2838                    }
2839
2840                    Condition::HS => {
2841                        // HS (unsigned GE): !(a < b) = !(LO)
2842                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2843                        let rn_hi_bits = reg_to_bits(rn_hi);
2844                        let rm_hi_bits = reg_to_bits(rm_hi);
2845                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2846                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2847                        bytes.extend_from_slice(&hw1.to_le_bytes());
2848                        bytes.extend_from_slice(&hw2.to_le_bytes());
2849                        bytes.extend_from_slice(&encode_setcond(0x2, rd_bits)); // HS = 0x2 (CS) = !LO
2850                    }
2851                }
2852
2853                Ok(bytes)
2854            }
2855
2856            // I64SetCondZ: Test if i64 register pair is zero, result 0/1 in rd
2857            // ORR.W rd, rn_lo, rn_hi; CMP rd, #0; ITE EQ; MOV 1; MOV 0
2858            ArmOp::I64SetCondZ { rd, rn_lo, rn_hi } => {
2859                let rd_bits = reg_to_bits(rd);
2860                let rn_lo_bits = reg_to_bits(rn_lo);
2861                let rn_hi_bits = reg_to_bits(rn_hi);
2862                let mut bytes = Vec::new();
2863
2864                // ORR.W rd, rn_lo, rn_hi: EA40 rn_lo | 0000 rd 0000 rn_hi
2865                let hw1: u16 = (0xEA40 | rn_lo_bits) as u16;
2866                let hw2: u16 = ((rd_bits << 8) | rn_hi_bits) as u16;
2867                bytes.extend_from_slice(&hw1.to_le_bytes());
2868                bytes.extend_from_slice(&hw2.to_le_bytes());
2869
2870                // CMP rd, #0 (16-bit): 0010 1 Rd 0000 0000
2871                let cmp_instr: u16 = 0x2800 | ((rd_bits as u16) << 8);
2872                bytes.extend_from_slice(&cmp_instr.to_le_bytes());
2873
2874                // ITE EQ; MOV rd, #1; MOV rd, #0
2875                let mask = 0xC_u16; // ITE EQ mask: firstcond[0]=0, mask=0xC
2876                let ite_instr: u16 = 0xBF00 | mask;
2877                bytes.extend_from_slice(&ite_instr.to_le_bytes());
2878                let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
2879                let mov_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
2880                bytes.extend_from_slice(&mov_one.to_le_bytes());
2881                bytes.extend_from_slice(&mov_zero.to_le_bytes());
2882
2883                Ok(bytes)
2884            }
2885
2886            // I64Mul: 64-bit multiply using UMULL + MLA cross products
2887            // Formula: result = (a_lo * b_lo) + ((a_lo * b_hi + a_hi * b_lo) << 32)
2888            // Uses R12 as scratch register
2889            ArmOp::I64Mul {
2890                rd_lo,
2891                rd_hi,
2892                rn_lo,
2893                rn_hi,
2894                rm_lo,
2895                rm_hi,
2896            } => {
2897                let rd_lo_bits = reg_to_bits(rd_lo);
2898                let rd_hi_bits = reg_to_bits(rd_hi);
2899                let rn_lo_bits = reg_to_bits(rn_lo);
2900                let rn_hi_bits = reg_to_bits(rn_hi);
2901                let rm_lo_bits = reg_to_bits(rm_lo);
2902                let rm_hi_bits = reg_to_bits(rm_hi);
2903                let r12: u32 = 12; // IP scratch register
2904                let mut bytes = Vec::new();
2905
2906                // 1. MUL R12, rn_lo, rm_hi  (R12 = a_lo * b_hi)
2907                // Thumb-2 MUL: hw1=0xFB00|Rn, hw2=0xF000|(Rd<<8)|Rm
2908                let hw1: u16 = (0xFB00 | rn_lo_bits) as u16;
2909                let hw2: u16 = (0xF000 | (r12 << 8) | rm_hi_bits) as u16;
2910                bytes.extend_from_slice(&hw1.to_le_bytes());
2911                bytes.extend_from_slice(&hw2.to_le_bytes());
2912
2913                // 2. MLA R12, rn_hi, rm_lo, R12  (R12 += a_hi * b_lo)
2914                // Thumb-2 MLA: hw1=0xFB00|Rn, hw2=(Ra<<12)|(Rd<<8)|Rm
2915                let hw1: u16 = (0xFB00 | rn_hi_bits) as u16;
2916                let hw2: u16 = ((r12 << 12) | (r12 << 8) | rm_lo_bits) as u16;
2917                bytes.extend_from_slice(&hw1.to_le_bytes());
2918                bytes.extend_from_slice(&hw2.to_le_bytes());
2919
2920                // 3. UMULL rd_lo, rd_hi, rn_lo, rm_lo  (rd_lo:rd_hi = a_lo * b_lo)
2921                // Thumb-2 UMULL: hw1=0xFBA0|Rn, hw2=(RdLo<<12)|(RdHi<<8)|Rm
2922                let hw1: u16 = (0xFBA0 | rn_lo_bits) as u16;
2923                let hw2: u16 = ((rd_lo_bits << 12) | (rd_hi_bits << 8) | rm_lo_bits) as u16;
2924                bytes.extend_from_slice(&hw1.to_le_bytes());
2925                bytes.extend_from_slice(&hw2.to_le_bytes());
2926
2927                // 4. ADD rd_hi, R12  (rd_hi += cross products)
2928                // 16-bit high reg ADD: 01000100 D Rm Rdn[2:0]
2929                let d_bit = (rd_hi_bits >> 3) & 1;
2930                let add_instr: u16 =
2931                    (0x4400 | (d_bit << 7) | (r12 << 3) | (rd_hi_bits & 0x7)) as u16;
2932                bytes.extend_from_slice(&add_instr.to_le_bytes());
2933
2934                Ok(bytes)
2935            }
2936
2937            // I64Shl: 64-bit shift left with branch for n<32 vs n>=32
2938            // rm_hi (R3) is used as temp register
2939            ArmOp::I64Shl {
2940                rd_lo,
2941                rd_hi,
2942                rn_lo,
2943                rn_hi,
2944                rm_lo,
2945                rm_hi,
2946            } => {
2947                let rd_lo_bits = reg_to_bits(rd_lo);
2948                let rd_hi_bits = reg_to_bits(rd_hi);
2949                let rn_lo_bits = reg_to_bits(rn_lo);
2950                let rn_hi_bits = reg_to_bits(rn_hi);
2951                let rm_lo_bits = reg_to_bits(rm_lo);
2952                let rm_hi_bits = reg_to_bits(rm_hi); // temp
2953                let mut bytes = Vec::new();
2954
2955                // AND.W rm_lo, rm_lo, #63  (mask shift amount to 6 bits)
2956                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
2957                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
2958                bytes.extend_from_slice(&hw1.to_le_bytes());
2959                bytes.extend_from_slice(&hw2.to_le_bytes());
2960
2961                // SUBS.W rm_hi, rm_lo, #32  (rm_hi = n-32, sets flags)
2962                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
2963                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
2964                bytes.extend_from_slice(&hw1.to_le_bytes());
2965                bytes.extend_from_slice(&hw2.to_le_bytes());
2966
2967                // BPL .large (branch if n >= 32, offset = +10 halfwords)
2968                let bpl: u16 = 0xD50A;
2969                bytes.extend_from_slice(&bpl.to_le_bytes());
2970
2971                // --- Small shift (n < 32) ---
2972                // RSB.W rm_hi, rm_lo, #32  (rm_hi = 32-n)
2973                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
2974                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
2975                bytes.extend_from_slice(&hw1.to_le_bytes());
2976                bytes.extend_from_slice(&hw2.to_le_bytes());
2977
2978                // LSR.W rm_hi, rn_lo, rm_hi  (rm_hi = lo >> (32-n), overflow bits)
2979                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
2980                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
2981                bytes.extend_from_slice(&hw1.to_le_bytes());
2982                bytes.extend_from_slice(&hw2.to_le_bytes());
2983
2984                // LSL.W rd_hi, rn_hi, rm_lo  (hi <<= n)
2985                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
2986                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
2987                bytes.extend_from_slice(&hw1.to_le_bytes());
2988                bytes.extend_from_slice(&hw2.to_le_bytes());
2989
2990                // ORR.W rd_hi, rd_hi, rm_hi  (hi |= overflow bits from lo)
2991                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
2992                let hw2: u16 = ((rd_hi_bits << 8) | rm_hi_bits) as u16;
2993                bytes.extend_from_slice(&hw1.to_le_bytes());
2994                bytes.extend_from_slice(&hw2.to_le_bytes());
2995
2996                // LSL.W rd_lo, rn_lo, rm_lo  (lo <<= n)
2997                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
2998                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
2999                bytes.extend_from_slice(&hw1.to_le_bytes());
3000                bytes.extend_from_slice(&hw2.to_le_bytes());
3001
3002                // B .done (skip large shift: +2 halfwords)
3003                let b_done: u16 = 0xE002;
3004                bytes.extend_from_slice(&b_done.to_le_bytes());
3005
3006                // --- Large shift (n >= 32) ---
3007                // LSL.W rd_hi, rn_lo, rm_hi  (hi = lo << (n-32))
3008                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3009                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_hi_bits) as u16;
3010                bytes.extend_from_slice(&hw1.to_le_bytes());
3011                bytes.extend_from_slice(&hw2.to_le_bytes());
3012
3013                // MOV rd_lo, #0
3014                let mov_zero: u16 = 0x2000 | ((rd_lo_bits as u16) << 8);
3015                bytes.extend_from_slice(&mov_zero.to_le_bytes());
3016
3017                Ok(bytes) // Total: 38 bytes
3018            }
3019
3020            // I64ShrU: 64-bit logical shift right with branch for n<32 vs n>=32
3021            ArmOp::I64ShrU {
3022                rd_lo,
3023                rd_hi,
3024                rn_lo,
3025                rn_hi,
3026                rm_lo,
3027                rm_hi,
3028            } => {
3029                let rd_lo_bits = reg_to_bits(rd_lo);
3030                let rd_hi_bits = reg_to_bits(rd_hi);
3031                let rn_lo_bits = reg_to_bits(rn_lo);
3032                let rn_hi_bits = reg_to_bits(rn_hi);
3033                let rm_lo_bits = reg_to_bits(rm_lo);
3034                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3035                let mut bytes = Vec::new();
3036
3037                // AND.W rm_lo, rm_lo, #63
3038                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3039                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3040                bytes.extend_from_slice(&hw1.to_le_bytes());
3041                bytes.extend_from_slice(&hw2.to_le_bytes());
3042
3043                // SUBS.W rm_hi, rm_lo, #32
3044                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3045                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3046                bytes.extend_from_slice(&hw1.to_le_bytes());
3047                bytes.extend_from_slice(&hw2.to_le_bytes());
3048
3049                // BPL .large (+10 halfwords)
3050                let bpl: u16 = 0xD50A;
3051                bytes.extend_from_slice(&bpl.to_le_bytes());
3052
3053                // --- Small shift (n < 32) ---
3054                // RSB.W rm_hi, rm_lo, #32  (rm_hi = 32-n)
3055                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3056                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3057                bytes.extend_from_slice(&hw1.to_le_bytes());
3058                bytes.extend_from_slice(&hw2.to_le_bytes());
3059
3060                // LSL.W rm_hi, rn_hi, rm_hi  (rm_hi = hi << (32-n), bits flowing to lo)
3061                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3062                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3063                bytes.extend_from_slice(&hw1.to_le_bytes());
3064                bytes.extend_from_slice(&hw2.to_le_bytes());
3065
3066                // LSR.W rd_lo, rn_lo, rm_lo  (lo >>= n)
3067                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3068                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3069                bytes.extend_from_slice(&hw1.to_le_bytes());
3070                bytes.extend_from_slice(&hw2.to_le_bytes());
3071
3072                // ORR.W rd_lo, rd_lo, rm_hi  (lo |= overflow from hi)
3073                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3074                let hw2: u16 = ((rd_lo_bits << 8) | rm_hi_bits) as u16;
3075                bytes.extend_from_slice(&hw1.to_le_bytes());
3076                bytes.extend_from_slice(&hw2.to_le_bytes());
3077
3078                // LSR.W rd_hi, rn_hi, rm_lo  (hi >>= n, logical)
3079                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3080                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3081                bytes.extend_from_slice(&hw1.to_le_bytes());
3082                bytes.extend_from_slice(&hw2.to_le_bytes());
3083
3084                // B .done (+2 halfwords)
3085                let b_done: u16 = 0xE002;
3086                bytes.extend_from_slice(&b_done.to_le_bytes());
3087
3088                // --- Large shift (n >= 32) ---
3089                // LSR.W rd_lo, rn_hi, rm_hi  (lo = hi >> (n-32))
3090                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3091                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_hi_bits) as u16;
3092                bytes.extend_from_slice(&hw1.to_le_bytes());
3093                bytes.extend_from_slice(&hw2.to_le_bytes());
3094
3095                // MOV rd_hi, #0
3096                let mov_zero: u16 = 0x2000 | ((rd_hi_bits as u16) << 8);
3097                bytes.extend_from_slice(&mov_zero.to_le_bytes());
3098
3099                Ok(bytes) // Total: 38 bytes
3100            }
3101
3102            // I64ShrS: 64-bit arithmetic shift right with branch for n<32 vs n>=32
3103            ArmOp::I64ShrS {
3104                rd_lo,
3105                rd_hi,
3106                rn_lo,
3107                rn_hi,
3108                rm_lo,
3109                rm_hi,
3110            } => {
3111                let rd_lo_bits = reg_to_bits(rd_lo);
3112                let rd_hi_bits = reg_to_bits(rd_hi);
3113                let rn_lo_bits = reg_to_bits(rn_lo);
3114                let rn_hi_bits = reg_to_bits(rn_hi);
3115                let rm_lo_bits = reg_to_bits(rm_lo);
3116                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3117                let mut bytes = Vec::new();
3118
3119                // AND.W rm_lo, rm_lo, #63
3120                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3121                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3122                bytes.extend_from_slice(&hw1.to_le_bytes());
3123                bytes.extend_from_slice(&hw2.to_le_bytes());
3124
3125                // SUBS.W rm_hi, rm_lo, #32
3126                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3127                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3128                bytes.extend_from_slice(&hw1.to_le_bytes());
3129                bytes.extend_from_slice(&hw2.to_le_bytes());
3130
3131                // BPL .large (+10 halfwords)
3132                let bpl: u16 = 0xD50A;
3133                bytes.extend_from_slice(&bpl.to_le_bytes());
3134
3135                // --- Small shift (n < 32) ---
3136                // RSB.W rm_hi, rm_lo, #32
3137                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3138                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3139                bytes.extend_from_slice(&hw1.to_le_bytes());
3140                bytes.extend_from_slice(&hw2.to_le_bytes());
3141
3142                // LSL.W rm_hi, rn_hi, rm_hi  (rm_hi = hi << (32-n), bits flowing to lo)
3143                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3144                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3145                bytes.extend_from_slice(&hw1.to_le_bytes());
3146                bytes.extend_from_slice(&hw2.to_le_bytes());
3147
3148                // LSR.W rd_lo, rn_lo, rm_lo  (lo >>= n, logical for lo word)
3149                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3150                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3151                bytes.extend_from_slice(&hw1.to_le_bytes());
3152                bytes.extend_from_slice(&hw2.to_le_bytes());
3153
3154                // ORR.W rd_lo, rd_lo, rm_hi  (lo |= overflow from hi)
3155                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3156                let hw2: u16 = ((rd_lo_bits << 8) | rm_hi_bits) as u16;
3157                bytes.extend_from_slice(&hw1.to_le_bytes());
3158                bytes.extend_from_slice(&hw2.to_le_bytes());
3159
3160                // ASR.W rd_hi, rn_hi, rm_lo  (hi >>= n, arithmetic/sign-extending)
3161                let hw1: u16 = (0xFA40 | rn_hi_bits) as u16;
3162                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3163                bytes.extend_from_slice(&hw1.to_le_bytes());
3164                bytes.extend_from_slice(&hw2.to_le_bytes());
3165
3166                // B .done (+3 halfwords, large shift is 8 bytes)
3167                let b_done: u16 = 0xE003;
3168                bytes.extend_from_slice(&b_done.to_le_bytes());
3169
3170                // --- Large shift (n >= 32) ---
3171                // ASR.W rd_lo, rn_hi, rm_hi  (lo = hi >>> (n-32))
3172                let hw1: u16 = (0xFA40 | rn_hi_bits) as u16;
3173                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_hi_bits) as u16;
3174                bytes.extend_from_slice(&hw1.to_le_bytes());
3175                bytes.extend_from_slice(&hw2.to_le_bytes());
3176
3177                // ASR.W rd_hi, rn_hi, #31  (hi = sign extension, all 0s or all 1s)
3178                // Thumb-2 ASR immediate: hw1=0xEA4F, hw2=imm3:Rd:imm2:10:Rm
3179                // imm5=31=11111 → imm3=111, imm2=11
3180                let hw1: u16 = 0xEA4F;
3181                let hw2: u16 = (0x7000 | (rd_hi_bits << 8) | 0x00E0 | rn_hi_bits) as u16;
3182                bytes.extend_from_slice(&hw1.to_le_bytes());
3183                bytes.extend_from_slice(&hw2.to_le_bytes());
3184
3185                Ok(bytes) // Total: 40 bytes
3186            }
3187
3188            // I64Rotl: 64-bit rotate left
3189            // For n < 32: new_hi = (hi << n) | (lo >> (32-n)), new_lo = (lo << n) | (hi >> (32-n))
3190            // For n >= 32: same formula but with lo/hi conceptually swapped, shift by (n-32)
3191            // Uses R4 (saved/restored) and R12 as scratch
3192            ArmOp::I64Rotl {
3193                rdlo,
3194                rdhi,
3195                rnlo,
3196                rnhi,
3197                shift,
3198            } => {
3199                let rd_lo_bits = reg_to_bits(rdlo);
3200                let rd_hi_bits = reg_to_bits(rdhi);
3201                let rn_lo_bits = reg_to_bits(rnlo);
3202                let rn_hi_bits = reg_to_bits(rnhi);
3203                let shift_bits = reg_to_bits(shift);
3204                let r12: u32 = 12; // IP scratch
3205                let r3: u32 = 3; // Scratch (high word of shift amount, unused)
3206                let r4: u32 = 4; // Scratch (saved/restored)
3207                let mut bytes = Vec::new();
3208
3209                // PUSH {R4}
3210                bytes.extend_from_slice(&0xB410u16.to_le_bytes());
3211
3212                // AND.W shift, shift, #63 (mask to 6 bits)
3213                let hw1: u16 = (0xF000 | shift_bits) as u16;
3214                let hw2: u16 = ((shift_bits << 8) | 0x3F) as u16;
3215                bytes.extend_from_slice(&hw1.to_le_bytes());
3216                bytes.extend_from_slice(&hw2.to_le_bytes());
3217
3218                // SUBS.W R3, shift, #32 (R3 = n-32, sets flags)
3219                let hw1: u16 = (0xF1B0 | shift_bits) as u16;
3220                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3221                bytes.extend_from_slice(&hw1.to_le_bytes());
3222                bytes.extend_from_slice(&hw2.to_le_bytes());
3223
3224                // BPL .large (branch if n >= 32, offset = +14 halfwords)
3225                let bpl: u16 = 0xD50E;
3226                bytes.extend_from_slice(&bpl.to_le_bytes());
3227
3228                // === Small rotation (n < 32) ===
3229                // RSB.W R3, shift, #32 (R3 = 32-n)
3230                let hw1: u16 = (0xF1C0 | shift_bits) as u16;
3231                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3232                bytes.extend_from_slice(&hw1.to_le_bytes());
3233                bytes.extend_from_slice(&hw2.to_le_bytes());
3234
3235                // LSR.W R4, rn_lo, R3 (R4 = lo >> (32-n), will go to new_hi)
3236                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3237                let hw2: u16 = (0xF000 | (r4 << 8) | r3) as u16;
3238                bytes.extend_from_slice(&hw1.to_le_bytes());
3239                bytes.extend_from_slice(&hw2.to_le_bytes());
3240
3241                // LSR.W R12, rn_hi, R3 (R12 = hi >> (32-n), will go to new_lo)
3242                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3243                let hw2: u16 = (0xF000 | (r12 << 8) | r3) as u16;
3244                bytes.extend_from_slice(&hw1.to_le_bytes());
3245                bytes.extend_from_slice(&hw2.to_le_bytes());
3246
3247                // LSL.W rd_hi, rn_hi, shift (rd_hi = hi << n)
3248                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3249                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | shift_bits) as u16;
3250                bytes.extend_from_slice(&hw1.to_le_bytes());
3251                bytes.extend_from_slice(&hw2.to_le_bytes());
3252
3253                // ORR.W rd_hi, rd_hi, R4 (rd_hi = (hi << n) | (lo >> (32-n)))
3254                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3255                let hw2: u16 = ((rd_hi_bits << 8) | r4) as u16;
3256                bytes.extend_from_slice(&hw1.to_le_bytes());
3257                bytes.extend_from_slice(&hw2.to_le_bytes());
3258
3259                // LSL.W rd_lo, rn_lo, shift (rd_lo = lo << n)
3260                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3261                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | shift_bits) as u16;
3262                bytes.extend_from_slice(&hw1.to_le_bytes());
3263                bytes.extend_from_slice(&hw2.to_le_bytes());
3264
3265                // ORR.W rd_lo, rd_lo, R12 (rd_lo = (lo << n) | (hi >> (32-n)))
3266                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3267                let hw2: u16 = ((rd_lo_bits << 8) | r12) as u16;
3268                bytes.extend_from_slice(&hw1.to_le_bytes());
3269                bytes.extend_from_slice(&hw2.to_le_bytes());
3270
3271                // B .done (skip large block, offset = +14 halfwords)
3272                let b_done: u16 = 0xE00E;
3273                bytes.extend_from_slice(&b_done.to_le_bytes());
3274
3275                // === Large rotation (n >= 32) ===
3276                // R3 already has n-32 from the SUBS
3277                // RSB.W R4, R3, #32 (R4 = 32-(n-32) = 64-n)
3278                let hw1: u16 = (0xF1C0 | r3) as u16;
3279                let hw2: u16 = ((r4 << 8) | 0x20) as u16;
3280                bytes.extend_from_slice(&hw1.to_le_bytes());
3281                bytes.extend_from_slice(&hw2.to_le_bytes());
3282
3283                // LSR.W R12, rn_hi, R4 (R12 = hi >> (64-n), goes to new_hi low bits)
3284                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3285                let hw2: u16 = (0xF000 | (r12 << 8) | r4) as u16;
3286                bytes.extend_from_slice(&hw1.to_le_bytes());
3287                bytes.extend_from_slice(&hw2.to_le_bytes());
3288
3289                // LSR.W R4, rn_lo, R4 (R4 = lo >> (64-n), goes to new_lo low bits)
3290                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3291                let hw2: u16 = (0xF000 | (r4 << 8) | r4) as u16;
3292                bytes.extend_from_slice(&hw1.to_le_bytes());
3293                bytes.extend_from_slice(&hw2.to_le_bytes());
3294
3295                // LSL.W shift, rn_lo, R3 (shift = lo << (n-32), new_hi high bits)
3296                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3297                let hw2: u16 = (0xF000 | (shift_bits << 8) | r3) as u16;
3298                bytes.extend_from_slice(&hw1.to_le_bytes());
3299                bytes.extend_from_slice(&hw2.to_le_bytes());
3300
3301                // ORR.W shift, shift, R12 (shift = (lo << (n-32)) | (hi >> (64-n)) = new_hi)
3302                let hw1: u16 = (0xEA40 | shift_bits) as u16;
3303                let hw2: u16 = ((shift_bits << 8) | r12) as u16;
3304                bytes.extend_from_slice(&hw1.to_le_bytes());
3305                bytes.extend_from_slice(&hw2.to_le_bytes());
3306
3307                // LSL.W rd_lo, rn_hi, R3 (rd_lo = hi << (n-32), new_lo high bits)
3308                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3309                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | r3) as u16;
3310                bytes.extend_from_slice(&hw1.to_le_bytes());
3311                bytes.extend_from_slice(&hw2.to_le_bytes());
3312
3313                // ORR.W rd_lo, rd_lo, R4 (rd_lo = (hi << (n-32)) | (lo >> (64-n)) = new_lo)
3314                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3315                let hw2: u16 = ((rd_lo_bits << 8) | r4) as u16;
3316                bytes.extend_from_slice(&hw1.to_le_bytes());
3317                bytes.extend_from_slice(&hw2.to_le_bytes());
3318
3319                // MOV rd_hi, shift (rd_hi = new_hi)
3320                let d_bit = (rd_hi_bits >> 3) & 1;
3321                let mov_instr: u16 =
3322                    (0x4600 | (d_bit << 7) | (shift_bits << 3) | (rd_hi_bits & 0x7)) as u16;
3323                bytes.extend_from_slice(&mov_instr.to_le_bytes());
3324
3325                // POP {R4}
3326                bytes.extend_from_slice(&0xBC10u16.to_le_bytes());
3327
3328                Ok(bytes) // Total: 74 bytes
3329            }
3330
3331            // I64Rotr: 64-bit rotate right
3332            // rotr(x, n) = rotl(x, 64-n)
3333            // For n < 32: new_lo = (lo >> n) | (hi << (32-n)), new_hi = (hi >> n) | (lo << (32-n))
3334            // For n >= 32: same formula but with lo/hi swapped, shift by (n-32)
3335            ArmOp::I64Rotr {
3336                rdlo,
3337                rdhi,
3338                rnlo,
3339                rnhi,
3340                shift,
3341            } => {
3342                let rd_lo_bits = reg_to_bits(rdlo);
3343                let rd_hi_bits = reg_to_bits(rdhi);
3344                let rn_lo_bits = reg_to_bits(rnlo);
3345                let rn_hi_bits = reg_to_bits(rnhi);
3346                let shift_bits = reg_to_bits(shift);
3347                let r12: u32 = 12;
3348                let r3: u32 = 3;
3349                let r4: u32 = 4;
3350                let mut bytes = Vec::new();
3351
3352                // PUSH {R4}
3353                bytes.extend_from_slice(&0xB410u16.to_le_bytes());
3354
3355                // AND.W shift, shift, #63
3356                let hw1: u16 = (0xF000 | shift_bits) as u16;
3357                let hw2: u16 = ((shift_bits << 8) | 0x3F) as u16;
3358                bytes.extend_from_slice(&hw1.to_le_bytes());
3359                bytes.extend_from_slice(&hw2.to_le_bytes());
3360
3361                // SUBS.W R3, shift, #32
3362                let hw1: u16 = (0xF1B0 | shift_bits) as u16;
3363                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3364                bytes.extend_from_slice(&hw1.to_le_bytes());
3365                bytes.extend_from_slice(&hw2.to_le_bytes());
3366
3367                // BPL .large (+14 halfwords)
3368                let bpl: u16 = 0xD50E;
3369                bytes.extend_from_slice(&bpl.to_le_bytes());
3370
3371                // === Small rotation (n < 32) ===
3372                // RSB.W R3, shift, #32 (R3 = 32-n)
3373                let hw1: u16 = (0xF1C0 | shift_bits) as u16;
3374                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3375                bytes.extend_from_slice(&hw1.to_le_bytes());
3376                bytes.extend_from_slice(&hw2.to_le_bytes());
3377
3378                // LSL.W R4, rn_hi, R3 (R4 = hi << (32-n), will go to new_lo)
3379                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3380                let hw2: u16 = (0xF000 | (r4 << 8) | r3) as u16;
3381                bytes.extend_from_slice(&hw1.to_le_bytes());
3382                bytes.extend_from_slice(&hw2.to_le_bytes());
3383
3384                // LSL.W R12, rn_lo, R3 (R12 = lo << (32-n), will go to new_hi)
3385                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3386                let hw2: u16 = (0xF000 | (r12 << 8) | r3) as u16;
3387                bytes.extend_from_slice(&hw1.to_le_bytes());
3388                bytes.extend_from_slice(&hw2.to_le_bytes());
3389
3390                // LSR.W rd_lo, rn_lo, shift (rd_lo = lo >> n)
3391                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3392                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | shift_bits) as u16;
3393                bytes.extend_from_slice(&hw1.to_le_bytes());
3394                bytes.extend_from_slice(&hw2.to_le_bytes());
3395
3396                // ORR.W rd_lo, rd_lo, R4 (rd_lo = (lo >> n) | (hi << (32-n)))
3397                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3398                let hw2: u16 = ((rd_lo_bits << 8) | r4) as u16;
3399                bytes.extend_from_slice(&hw1.to_le_bytes());
3400                bytes.extend_from_slice(&hw2.to_le_bytes());
3401
3402                // LSR.W rd_hi, rn_hi, shift (rd_hi = hi >> n)
3403                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3404                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | shift_bits) as u16;
3405                bytes.extend_from_slice(&hw1.to_le_bytes());
3406                bytes.extend_from_slice(&hw2.to_le_bytes());
3407
3408                // ORR.W rd_hi, rd_hi, R12 (rd_hi = (hi >> n) | (lo << (32-n)))
3409                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3410                let hw2: u16 = ((rd_hi_bits << 8) | r12) as u16;
3411                bytes.extend_from_slice(&hw1.to_le_bytes());
3412                bytes.extend_from_slice(&hw2.to_le_bytes());
3413
3414                // B .done (+14 halfwords)
3415                let b_done: u16 = 0xE00E;
3416                bytes.extend_from_slice(&b_done.to_le_bytes());
3417
3418                // === Large rotation (n >= 32) ===
3419                // RSB.W R4, R3, #32 (R4 = 64-n)
3420                let hw1: u16 = (0xF1C0 | r3) as u16;
3421                let hw2: u16 = ((r4 << 8) | 0x20) as u16;
3422                bytes.extend_from_slice(&hw1.to_le_bytes());
3423                bytes.extend_from_slice(&hw2.to_le_bytes());
3424
3425                // LSL.W R12, rn_lo, R4 (R12 = lo << (64-n), goes to new_lo low bits)
3426                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3427                let hw2: u16 = (0xF000 | (r12 << 8) | r4) as u16;
3428                bytes.extend_from_slice(&hw1.to_le_bytes());
3429                bytes.extend_from_slice(&hw2.to_le_bytes());
3430
3431                // LSL.W R4, rn_hi, R4 (R4 = hi << (64-n), goes to new_hi low bits)
3432                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3433                let hw2: u16 = (0xF000 | (r4 << 8) | r4) as u16;
3434                bytes.extend_from_slice(&hw1.to_le_bytes());
3435                bytes.extend_from_slice(&hw2.to_le_bytes());
3436
3437                // LSR.W shift, rn_hi, R3 (shift = hi >> (n-32), new_lo high bits)
3438                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3439                let hw2: u16 = (0xF000 | (shift_bits << 8) | r3) as u16;
3440                bytes.extend_from_slice(&hw1.to_le_bytes());
3441                bytes.extend_from_slice(&hw2.to_le_bytes());
3442
3443                // ORR.W shift, shift, R12 (shift = (hi >> (n-32)) | (lo << (64-n)) = new_lo)
3444                let hw1: u16 = (0xEA40 | shift_bits) as u16;
3445                let hw2: u16 = ((shift_bits << 8) | r12) as u16;
3446                bytes.extend_from_slice(&hw1.to_le_bytes());
3447                bytes.extend_from_slice(&hw2.to_le_bytes());
3448
3449                // LSR.W rd_hi, rn_lo, R3 (rd_hi = lo >> (n-32), new_hi high bits)
3450                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3451                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | r3) as u16;
3452                bytes.extend_from_slice(&hw1.to_le_bytes());
3453                bytes.extend_from_slice(&hw2.to_le_bytes());
3454
3455                // ORR.W rd_hi, rd_hi, R4 (rd_hi = (lo >> (n-32)) | (hi << (64-n)) = new_hi)
3456                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3457                let hw2: u16 = ((rd_hi_bits << 8) | r4) as u16;
3458                bytes.extend_from_slice(&hw1.to_le_bytes());
3459                bytes.extend_from_slice(&hw2.to_le_bytes());
3460
3461                // MOV rd_lo, shift (rd_lo = new_lo)
3462                let d_bit = (rd_lo_bits >> 3) & 1;
3463                let mov_instr: u16 =
3464                    (0x4600 | (d_bit << 7) | (shift_bits << 3) | (rd_lo_bits & 0x7)) as u16;
3465                bytes.extend_from_slice(&mov_instr.to_le_bytes());
3466
3467                // POP {R4}
3468                bytes.extend_from_slice(&0xBC10u16.to_le_bytes());
3469
3470                Ok(bytes) // Total: 74 bytes
3471            }
3472
3473            // I64Clz: Count leading zeros in 64-bit value
3474            // If hi != 0: result = CLZ(hi)
3475            // If hi == 0: result = 32 + CLZ(lo)
3476            //
3477            // Layout (using CMP+BNE approach for consistency):
3478            // 0: CMP.W rnhi, #0 (4 bytes)
3479            // 4: BEQ .hi_zero (2 bytes) - branch forward to offset 14
3480            // 6: CLZ.W rd, rnhi (4 bytes)
3481            // 10: B .done (2 bytes) - branch forward to offset 22
3482            // 12: NOP (2 bytes) - padding for alignment
3483            // 14: .hi_zero: CLZ.W rd, rnlo (4 bytes)
3484            // 18: ADD.W rd, rd, #32 (4 bytes)
3485            // 22: .done
3486            ArmOp::I64Clz { rd, rnlo, rnhi } => {
3487                let rd_bits = reg_to_bits(rd);
3488                let rn_lo_bits = reg_to_bits(rnlo);
3489                let rn_hi_bits = reg_to_bits(rnhi);
3490                let mut bytes = Vec::new();
3491
3492                // CMP.W rnhi, #0 (4 bytes at offset 0)
3493                let hw1: u16 = (0xF1B0 | rn_hi_bits) as u16;
3494                let hw2: u16 = 0x0F00;
3495                bytes.extend_from_slice(&hw1.to_le_bytes());
3496                bytes.extend_from_slice(&hw2.to_le_bytes());
3497
3498                // BEQ .hi_zero (2 bytes at offset 4)
3499                // PC = 4 + 4 = 8, target = 14, offset = 6, imm8 = 3
3500                let beq: u16 = 0xD003;
3501                bytes.extend_from_slice(&beq.to_le_bytes());
3502
3503                // CLZ.W rd, rnhi (4 bytes at offset 6)
3504                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3505                let hw1: u16 = (0xFAB0 | rn_hi_bits) as u16;
3506                let hw2: u16 = (0xF080 | (rd_bits << 8) | rn_hi_bits) as u16;
3507                bytes.extend_from_slice(&hw1.to_le_bytes());
3508                bytes.extend_from_slice(&hw2.to_le_bytes());
3509
3510                // B .done (2 bytes at offset 10)
3511                // PC = 10 + 4 = 14, target = 22, offset = 8, imm11 = 4
3512                let b_done: u16 = 0xE004;
3513                bytes.extend_from_slice(&b_done.to_le_bytes());
3514
3515                // NOP (2 bytes at offset 12) - padding
3516                bytes.extend_from_slice(&0xBF00u16.to_le_bytes());
3517
3518                // .hi_zero: (offset 14)
3519                // CLZ.W rd, rnlo (4 bytes)
3520                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3521                let hw1: u16 = (0xFAB0 | rn_lo_bits) as u16;
3522                let hw2: u16 = (0xF080 | (rd_bits << 8) | rn_lo_bits) as u16;
3523                bytes.extend_from_slice(&hw1.to_le_bytes());
3524                bytes.extend_from_slice(&hw2.to_le_bytes());
3525
3526                // ADD.W rd, rd, #32 (4 bytes at offset 18)
3527                let hw1: u16 = (0xF100 | rd_bits) as u16;
3528                let hw2: u16 = ((rd_bits << 8) | 0x20) as u16;
3529                bytes.extend_from_slice(&hw1.to_le_bytes());
3530                bytes.extend_from_slice(&hw2.to_le_bytes());
3531
3532                // .done: (offset 22)
3533                // i64.clz returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
3534                // MOVS Rn, #0: 0010 0 Rn 00000000
3535                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3536                bytes.extend_from_slice(&mov0.to_le_bytes());
3537
3538                Ok(bytes)
3539            }
3540
3541            // I64Ctz: Count trailing zeros in 64-bit value
3542            // If lo != 0: result = CTZ(lo) = CLZ(RBIT(lo))
3543            // If lo == 0: result = 32 + CTZ(hi) = 32 + CLZ(RBIT(hi))
3544            //
3545            // Layout:
3546            // 0: CMP.W rnlo, #0 (4 bytes)
3547            // 4: BEQ .lo_zero (2 bytes) - branch to offset 18
3548            // 6: RBIT.W rd, rnlo (4 bytes)
3549            // 10: CLZ.W rd, rd (4 bytes)
3550            // 14: B .done (2 bytes) - branch to offset 30
3551            // 16: NOP (2 bytes) - padding
3552            // 18: .lo_zero: RBIT.W rd, rnhi (4 bytes)
3553            // 22: CLZ.W rd, rd (4 bytes)
3554            // 26: ADD.W rd, rd, #32 (4 bytes)
3555            // 30: .done
3556            ArmOp::I64Ctz { rd, rnlo, rnhi } => {
3557                let rd_bits = reg_to_bits(rd);
3558                let rn_lo_bits = reg_to_bits(rnlo);
3559                let rn_hi_bits = reg_to_bits(rnhi);
3560                let mut bytes = Vec::new();
3561
3562                // CMP.W rnlo, #0 (4 bytes at offset 0)
3563                let hw1: u16 = (0xF1B0 | rn_lo_bits) as u16;
3564                let hw2: u16 = 0x0F00;
3565                bytes.extend_from_slice(&hw1.to_le_bytes());
3566                bytes.extend_from_slice(&hw2.to_le_bytes());
3567
3568                // BEQ .lo_zero (2 bytes at offset 4)
3569                // PC = 4 + 4 = 8, target = 18, offset = 10, imm8 = 5
3570                let beq: u16 = 0xD005;
3571                bytes.extend_from_slice(&beq.to_le_bytes());
3572
3573                // RBIT.W rd, rnlo (4 bytes at offset 6)
3574                // RBIT T1: hw1 = 0xFA9<Rm>, hw2 = 0xF<Rd>A<Rm>
3575                let hw1: u16 = (0xFA90 | rn_lo_bits) as u16;
3576                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rn_lo_bits) as u16;
3577                bytes.extend_from_slice(&hw1.to_le_bytes());
3578                bytes.extend_from_slice(&hw2.to_le_bytes());
3579
3580                // CLZ.W rd, rd (4 bytes at offset 10)
3581                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3582                let hw1: u16 = (0xFAB0 | rd_bits) as u16;
3583                let hw2: u16 = (0xF080 | (rd_bits << 8) | rd_bits) as u16;
3584                bytes.extend_from_slice(&hw1.to_le_bytes());
3585                bytes.extend_from_slice(&hw2.to_le_bytes());
3586
3587                // B .done (2 bytes at offset 14)
3588                // PC = 14 + 4 = 18, target = 30, offset = 12, imm11 = 6
3589                let b_done: u16 = 0xE006;
3590                bytes.extend_from_slice(&b_done.to_le_bytes());
3591
3592                // NOP (2 bytes at offset 16) - padding
3593                bytes.extend_from_slice(&0xBF00u16.to_le_bytes());
3594
3595                // .lo_zero: (offset 18)
3596                // RBIT.W rd, rnhi (4 bytes)
3597                // RBIT T1: hw1 = 0xFA9<Rm>, hw2 = 0xF<Rd>A<Rm>
3598                let hw1: u16 = (0xFA90 | rn_hi_bits) as u16;
3599                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rn_hi_bits) as u16;
3600                bytes.extend_from_slice(&hw1.to_le_bytes());
3601                bytes.extend_from_slice(&hw2.to_le_bytes());
3602
3603                // CLZ.W rd, rd (4 bytes at offset 22)
3604                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3605                let hw1: u16 = (0xFAB0 | rd_bits) as u16;
3606                let hw2: u16 = (0xF080 | (rd_bits << 8) | rd_bits) as u16;
3607                bytes.extend_from_slice(&hw1.to_le_bytes());
3608                bytes.extend_from_slice(&hw2.to_le_bytes());
3609
3610                // ADD.W rd, rd, #32 (4 bytes at offset 26)
3611                let hw1: u16 = (0xF100 | rd_bits) as u16;
3612                let hw2: u16 = ((rd_bits << 8) | 0x20) as u16;
3613                bytes.extend_from_slice(&hw1.to_le_bytes());
3614                bytes.extend_from_slice(&hw2.to_le_bytes());
3615
3616                // .done: (offset 30)
3617                // i64.ctz returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
3618                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3619                bytes.extend_from_slice(&mov0.to_le_bytes());
3620
3621                Ok(bytes)
3622            }
3623
3624            // I64Popcnt: Population count of 64-bit value
3625            // result = POPCNT(lo) + POPCNT(hi)
3626            // Using SIMD-style parallel bit counting algorithm
3627            ArmOp::I64Popcnt { rd, rnlo, rnhi } => {
3628                let rd_bits = reg_to_bits(rd);
3629                let rn_lo_bits = reg_to_bits(rnlo);
3630                let rn_hi_bits = reg_to_bits(rnhi);
3631                let r12: u32 = 12; // IP scratch
3632                let r3: u32 = 3; // Scratch for hi popcnt result
3633                let mut bytes = Vec::new();
3634
3635                // PUSH {R3, R4, R5} - save scratch registers
3636                bytes.extend_from_slice(&0xB438u16.to_le_bytes());
3637
3638                // Strategy: compute popcnt(lo) -> R4, popcnt(hi) -> R5, add them -> rd
3639                // Using lookup table approach for each byte would be too large
3640                // Using shift-and-add approach instead
3641
3642                // For simplicity and correctness, use the efficient parallel algorithm
3643                // but implement it as a series of inline operations
3644
3645                // MOV R4, rnlo
3646                let d_bit: u32 = 0; // R4 < 8, so high bit is 0
3647                let mov: u16 = (0x4600 | (d_bit << 7) | (rn_lo_bits << 3) | (4 & 0x7)) as u16;
3648                bytes.extend_from_slice(&mov.to_le_bytes());
3649
3650                // MOV R5, rnhi
3651                let d_bit: u32 = 0; // R5 < 8, so high bit is 0
3652                let mov: u16 = (0x4600 | (d_bit << 7) | (rn_hi_bits << 3) | (5 & 0x7)) as u16;
3653                bytes.extend_from_slice(&mov.to_le_bytes());
3654
3655                // --- POPCNT for R4 (lo word) ---
3656                // Step 1: x = x - ((x >> 1) & 0x55555555)
3657                // LSR.W R12, R4, #1
3658                let hw1: u16 = 0xEA4F;
3659                let hw2: u16 = ((r12 << 8) | 0x50 | 4) as u16;
3660                bytes.extend_from_slice(&hw1.to_le_bytes());
3661                bytes.extend_from_slice(&hw2.to_le_bytes());
3662
3663                // Load 0x55555555 into R3 using MOVW/MOVT
3664                // MOVW R3, #0x5555
3665                bytes.extend_from_slice(&0xF245u16.to_le_bytes());
3666                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3667                // MOVT R3, #0x5555
3668                bytes.extend_from_slice(&0xF2C5u16.to_le_bytes());
3669                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3670
3671                // AND.W R12, R12, R3
3672                let hw1: u16 = (0xEA00 | r12) as u16;
3673                let hw2: u16 = ((r12 << 8) | r3) as u16;
3674                bytes.extend_from_slice(&hw1.to_le_bytes());
3675                bytes.extend_from_slice(&hw2.to_le_bytes());
3676
3677                // SUB.W R4, R4, R12
3678                let hw1: u16 = (0xEBA0 | 4) as u16;
3679                let hw2: u16 = ((4 << 8) | r12) as u16;
3680                bytes.extend_from_slice(&hw1.to_le_bytes());
3681                bytes.extend_from_slice(&hw2.to_le_bytes());
3682
3683                // Step 2: x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
3684                // Load 0x33333333 into R3
3685                // MOVW R3, #0x3333
3686                bytes.extend_from_slice(&0xF243u16.to_le_bytes());
3687                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3688                // MOVT R3, #0x3333
3689                bytes.extend_from_slice(&0xF2C3u16.to_le_bytes());
3690                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3691
3692                // AND.W R12, R4, R3
3693                let hw1: u16 = (0xEA00 | 4) as u16;
3694                let hw2: u16 = ((r12 << 8) | r3) as u16;
3695                bytes.extend_from_slice(&hw1.to_le_bytes());
3696                bytes.extend_from_slice(&hw2.to_le_bytes());
3697
3698                // LSR.W R4, R4, #2
3699                let hw1: u16 = 0xEA4F;
3700                let hw2: u16 = ((4 << 8) | 0x90 | 4) as u16;
3701                bytes.extend_from_slice(&hw1.to_le_bytes());
3702                bytes.extend_from_slice(&hw2.to_le_bytes());
3703
3704                // AND.W R4, R4, R3
3705                let hw1: u16 = (0xEA00 | 4) as u16;
3706                let hw2: u16 = ((4 << 8) | r3) as u16;
3707                bytes.extend_from_slice(&hw1.to_le_bytes());
3708                bytes.extend_from_slice(&hw2.to_le_bytes());
3709
3710                // ADD.W R4, R4, R12
3711                let hw1: u16 = (0xEB00 | 4) as u16;
3712                let hw2: u16 = ((4 << 8) | r12) as u16;
3713                bytes.extend_from_slice(&hw1.to_le_bytes());
3714                bytes.extend_from_slice(&hw2.to_le_bytes());
3715
3716                // Step 3: x = (x + (x >> 4)) & 0x0F0F0F0F
3717                // LSR.W R12, R4, #4
3718                // hw2 = (imm3 << 12) | (Rd << 8) | (imm2 << 6) | (type << 4) | Rm
3719                // imm5=4=00100 → imm3=1, imm2=0, type=01(LSR)
3720                let hw1: u16 = 0xEA4F;
3721                let hw2: u16 = (0x1000 | (r12 << 8) | 0x10 | 4) as u16;
3722                bytes.extend_from_slice(&hw1.to_le_bytes());
3723                bytes.extend_from_slice(&hw2.to_le_bytes());
3724
3725                // ADD.W R4, R4, R12
3726                let hw1: u16 = (0xEB00 | 4) as u16;
3727                let hw2: u16 = ((4 << 8) | r12) as u16;
3728                bytes.extend_from_slice(&hw1.to_le_bytes());
3729                bytes.extend_from_slice(&hw2.to_le_bytes());
3730
3731                // Load 0x0F0F0F0F into R3
3732                // MOVW R3, #0x0F0F (imm4=0, i=1, imm3=7, imm8=0x0F)
3733                // hw1 = 11110 1 10 0100 0000 = 0xF640
3734                // hw2 = 0 111 0011 00001111 = 0x730F
3735                bytes.extend_from_slice(&0xF640u16.to_le_bytes());
3736                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3737                // MOVT R3, #0x0F0F
3738                bytes.extend_from_slice(&0xF6C0u16.to_le_bytes());
3739                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3740
3741                // AND.W R4, R4, R3
3742                let hw1: u16 = (0xEA00 | 4) as u16;
3743                let hw2: u16 = ((4 << 8) | r3) as u16;
3744                bytes.extend_from_slice(&hw1.to_le_bytes());
3745                bytes.extend_from_slice(&hw2.to_le_bytes());
3746
3747                // Step 4: x = x * 0x01010101 >> 24
3748                // Load 0x01010101 into R3
3749                // MOVW R3, #0x0101
3750                bytes.extend_from_slice(&0xF240u16.to_le_bytes());
3751                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3752                // MOVT R3, #0x0101
3753                bytes.extend_from_slice(&0xF2C0u16.to_le_bytes());
3754                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3755
3756                // MUL R4, R4, R3
3757                // MUL T2: hw1 = 0xFB00|Rn, hw2 = 0xF000|(Rd<<8)|Rm
3758                let hw1: u16 = (0xFB00 | 4) as u16;
3759                let hw2: u16 = (0xF000 | (4 << 8) | r3) as u16;
3760                bytes.extend_from_slice(&hw1.to_le_bytes());
3761                bytes.extend_from_slice(&hw2.to_le_bytes());
3762
3763                // LSR.W R4, R4, #24
3764                // imm5=24=11000 → imm3=6, imm2=0, type=01(LSR)
3765                let hw1: u16 = 0xEA4F;
3766                let hw2: u16 = (0x6000 | (4 << 8) | 0x10 | 4) as u16;
3767                bytes.extend_from_slice(&hw1.to_le_bytes());
3768                bytes.extend_from_slice(&hw2.to_le_bytes());
3769
3770                // --- POPCNT for R5 (hi word) - same algorithm ---
3771                // Step 1
3772                let hw1: u16 = 0xEA4F;
3773                let hw2: u16 = ((r12 << 8) | 0x50 | 5) as u16;
3774                bytes.extend_from_slice(&hw1.to_le_bytes());
3775                bytes.extend_from_slice(&hw2.to_le_bytes());
3776
3777                // Load 0x55555555 into R3
3778                bytes.extend_from_slice(&0xF245u16.to_le_bytes());
3779                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3780                bytes.extend_from_slice(&0xF2C5u16.to_le_bytes());
3781                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3782
3783                let hw1: u16 = (0xEA00 | r12) as u16;
3784                let hw2: u16 = ((r12 << 8) | r3) as u16;
3785                bytes.extend_from_slice(&hw1.to_le_bytes());
3786                bytes.extend_from_slice(&hw2.to_le_bytes());
3787
3788                let hw1: u16 = (0xEBA0 | 5) as u16;
3789                let hw2: u16 = ((5 << 8) | r12) as u16;
3790                bytes.extend_from_slice(&hw1.to_le_bytes());
3791                bytes.extend_from_slice(&hw2.to_le_bytes());
3792
3793                // Step 2
3794                bytes.extend_from_slice(&0xF243u16.to_le_bytes());
3795                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3796                bytes.extend_from_slice(&0xF2C3u16.to_le_bytes());
3797                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3798
3799                let hw1: u16 = (0xEA00 | 5) as u16;
3800                let hw2: u16 = ((r12 << 8) | r3) as u16;
3801                bytes.extend_from_slice(&hw1.to_le_bytes());
3802                bytes.extend_from_slice(&hw2.to_le_bytes());
3803
3804                let hw1: u16 = 0xEA4F;
3805                let hw2: u16 = ((5 << 8) | 0x90 | 5) as u16;
3806                bytes.extend_from_slice(&hw1.to_le_bytes());
3807                bytes.extend_from_slice(&hw2.to_le_bytes());
3808
3809                let hw1: u16 = (0xEA00 | 5) as u16;
3810                let hw2: u16 = ((5 << 8) | r3) as u16;
3811                bytes.extend_from_slice(&hw1.to_le_bytes());
3812                bytes.extend_from_slice(&hw2.to_le_bytes());
3813
3814                let hw1: u16 = (0xEB00 | 5) as u16;
3815                let hw2: u16 = ((5 << 8) | r12) as u16;
3816                bytes.extend_from_slice(&hw1.to_le_bytes());
3817                bytes.extend_from_slice(&hw2.to_le_bytes());
3818
3819                // Step 3: LSR.W R12, R5, #4
3820                // imm5=4=00100 → imm3=1, imm2=0, type=01(LSR)
3821                let hw1: u16 = 0xEA4F;
3822                let hw2: u16 = (0x1000 | (r12 << 8) | 0x10 | 5) as u16;
3823                bytes.extend_from_slice(&hw1.to_le_bytes());
3824                bytes.extend_from_slice(&hw2.to_le_bytes());
3825
3826                let hw1: u16 = (0xEB00 | 5) as u16;
3827                let hw2: u16 = ((5 << 8) | r12) as u16;
3828                bytes.extend_from_slice(&hw1.to_le_bytes());
3829                bytes.extend_from_slice(&hw2.to_le_bytes());
3830
3831                // Load 0x0F0F0F0F into R3 (for hi-word)
3832                bytes.extend_from_slice(&0xF640u16.to_le_bytes());
3833                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3834                bytes.extend_from_slice(&0xF6C0u16.to_le_bytes());
3835                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3836
3837                let hw1: u16 = (0xEA00 | 5) as u16;
3838                let hw2: u16 = ((5 << 8) | r3) as u16;
3839                bytes.extend_from_slice(&hw1.to_le_bytes());
3840                bytes.extend_from_slice(&hw2.to_le_bytes());
3841
3842                // Step 4
3843                bytes.extend_from_slice(&0xF240u16.to_le_bytes());
3844                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3845                bytes.extend_from_slice(&0xF2C0u16.to_le_bytes());
3846                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3847
3848                // MUL R5, R5, R3
3849                // MUL T2: hw1 = 0xFB00|Rn, hw2 = 0xF000|(Rd<<8)|Rm
3850                let hw1: u16 = (0xFB00 | 5) as u16;
3851                let hw2: u16 = (0xF000 | (5 << 8) | r3) as u16;
3852                bytes.extend_from_slice(&hw1.to_le_bytes());
3853                bytes.extend_from_slice(&hw2.to_le_bytes());
3854
3855                // LSR.W R5, R5, #24
3856                // imm5=24=11000 → imm3=6, imm2=0, type=01(LSR)
3857                let hw1: u16 = 0xEA4F;
3858                let hw2: u16 = (0x6000 | (5 << 8) | 0x10 | 5) as u16;
3859                bytes.extend_from_slice(&hw1.to_le_bytes());
3860                bytes.extend_from_slice(&hw2.to_le_bytes());
3861
3862                // ADD rd, R4, R5 (combine lo and hi counts)
3863                // ADDS Rd, Rn, Rm (T1): 0001 100 Rm Rn Rd = 0x1800 | (Rm<<6) | (Rn<<3) | Rd
3864                let rd_bits_u16 = rd_bits as u16;
3865                let instr: u16 = 0x1800 | (5 << 6) | (4 << 3) | rd_bits_u16;
3866                bytes.extend_from_slice(&instr.to_le_bytes());
3867
3868                // POP {R3, R4, R5}
3869                bytes.extend_from_slice(&0xBC38u16.to_le_bytes());
3870
3871                // i64.popcnt returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
3872                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3873                bytes.extend_from_slice(&mov0.to_le_bytes());
3874
3875                Ok(bytes)
3876            }
3877
3878            // I64Extend8S: Sign-extend low 8 bits to 64 bits
3879            // Result: rdlo = sign_extend_8(rnlo), rdhi = rdlo >> 31
3880            ArmOp::I64Extend8S { rdlo, rdhi, rnlo } => {
3881                let rdlo_bits = reg_to_bits(rdlo);
3882                let rdhi_bits = reg_to_bits(rdhi);
3883                let rnlo_bits = reg_to_bits(rnlo);
3884                let mut bytes = Vec::new();
3885
3886                // SXTB.W rdlo, rnlo (sign-extend byte to 32-bit)
3887                // SXTB T2: hw1 = 0xFA4F, hw2 = 0xF0<Rd><Rm>
3888                let hw1: u16 = 0xFA4F_u16;
3889                let hw2: u16 = (0xF080 | (rdlo_bits << 8) | rnlo_bits) as u16;
3890                bytes.extend_from_slice(&hw1.to_le_bytes());
3891                bytes.extend_from_slice(&hw2.to_le_bytes());
3892
3893                // ASR.W rdhi, rdlo, #31 (sign-extend to high word)
3894                // ASR (immediate): hw1 = 0xEA4F, hw2 = imm3:Rd:imm2:type:Rm
3895                // For imm5=31: imm3=111, imm2=11, type=10 (ASR)
3896                // hw2 = (7 << 12) | (rdhi << 8) | (3 << 6) | (2 << 4) | rdlo
3897                let hw1: u16 = 0xEA4F;
3898                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rdlo_bits) as u16;
3899                bytes.extend_from_slice(&hw1.to_le_bytes());
3900                bytes.extend_from_slice(&hw2.to_le_bytes());
3901
3902                Ok(bytes)
3903            }
3904
3905            // I64Extend16S: Sign-extend low 16 bits to 64 bits
3906            // Result: rdlo = sign_extend_16(rnlo), rdhi = rdlo >> 31
3907            ArmOp::I64Extend16S { rdlo, rdhi, rnlo } => {
3908                let rdlo_bits = reg_to_bits(rdlo);
3909                let rdhi_bits = reg_to_bits(rdhi);
3910                let rnlo_bits = reg_to_bits(rnlo);
3911                let mut bytes = Vec::new();
3912
3913                // SXTH.W rdlo, rnlo (sign-extend halfword to 32-bit)
3914                // SXTH T2: hw1 = 0xFA0F, hw2 = 0xF0<Rd><Rm>
3915                let hw1: u16 = 0xFA0F_u16;
3916                let hw2: u16 = (0xF080 | (rdlo_bits << 8) | rnlo_bits) as u16;
3917                bytes.extend_from_slice(&hw1.to_le_bytes());
3918                bytes.extend_from_slice(&hw2.to_le_bytes());
3919
3920                // ASR.W rdhi, rdlo, #31 (sign-extend to high word)
3921                let hw1: u16 = 0xEA4F;
3922                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rdlo_bits) as u16;
3923                bytes.extend_from_slice(&hw1.to_le_bytes());
3924                bytes.extend_from_slice(&hw2.to_le_bytes());
3925
3926                Ok(bytes)
3927            }
3928
3929            // I64Extend32S: Sign-extend low 32 bits to 64 bits
3930            // Result: rdlo = rnlo, rdhi = rnlo >> 31
3931            ArmOp::I64Extend32S { rdlo, rdhi, rnlo } => {
3932                let rdlo_bits = reg_to_bits(rdlo);
3933                let rdhi_bits = reg_to_bits(rdhi);
3934                let rnlo_bits = reg_to_bits(rnlo);
3935                let mut bytes = Vec::new();
3936
3937                // MOV rdlo, rnlo (if different)
3938                if rdlo_bits != rnlo_bits {
3939                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
3940                    let d_bit = ((rdlo_bits >> 3) & 1) as u16;
3941                    let mov: u16 = 0x4600
3942                        | (d_bit << 7)
3943                        | ((rnlo_bits as u16) << 3)
3944                        | ((rdlo_bits & 0x7) as u16);
3945                    bytes.extend_from_slice(&mov.to_le_bytes());
3946                }
3947
3948                // ASR.W rdhi, rnlo, #31 (sign-extend to high word)
3949                let hw1: u16 = 0xEA4F;
3950                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rnlo_bits) as u16;
3951                bytes.extend_from_slice(&hw1.to_le_bytes());
3952                bytes.extend_from_slice(&hw2.to_le_bytes());
3953
3954                Ok(bytes)
3955            }
3956
3957            // SelectMove: IT <cond>; MOV{cond} rd, rm
3958            // Conditional move: only execute MOV if condition is true
3959            ArmOp::SelectMove { rd, rm, cond } => {
3960                let rd_bits = reg_to_bits(rd) as u16;
3961                let rm_bits = reg_to_bits(rm) as u16;
3962
3963                // Condition code encoding for IT block
3964                use synth_synthesis::Condition;
3965                let cond_bits: u16 = match cond {
3966                    Condition::EQ => 0x0, // Equal
3967                    Condition::NE => 0x1, // Not equal
3968                    Condition::HS => 0x2, // Higher or same (unsigned >=)
3969                    Condition::LO => 0x3, // Lower (unsigned <)
3970                    Condition::HI => 0x8, // Higher (unsigned >)
3971                    Condition::LS => 0x9, // Lower or same (unsigned <=)
3972                    Condition::GE => 0xA, // Greater or equal (signed)
3973                    Condition::LT => 0xB, // Less than (signed)
3974                    Condition::GT => 0xC, // Greater than (signed)
3975                    Condition::LE => 0xD, // Less or equal (signed)
3976                };
3977
3978                // IT <cond>: single Then block (mask = 0x8 for T only)
3979                // IT instruction: 1011 1111 firstcond mask
3980                let it_instr: u16 = 0xBF00 | (cond_bits << 4) | 0x8;
3981
3982                // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
3983                // This MOV will only execute if condition is true due to IT block
3984                let d_bit = (rd_bits >> 3) & 1;
3985                let mov_instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
3986
3987                // Emit: IT <cond>, MOV rd, rm
3988                let mut bytes = it_instr.to_le_bytes().to_vec();
3989                bytes.extend_from_slice(&mov_instr.to_le_bytes());
3990                Ok(bytes)
3991            }
3992
3993            // Popcnt: Population count (count set bits)
3994            // ARM Cortex-M has no native POPCNT, so we implement the bit manipulation algorithm:
3995            // x = x - ((x >> 1) & 0x55555555);
3996            // x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
3997            // x = (x + (x >> 4)) & 0x0F0F0F0F;
3998            // x = x + (x >> 8);
3999            // x = x + (x >> 16);
4000            // return x & 0x3F;
4001            //
4002            // Uses rd as working register and R12 as scratch for constants
4003            ArmOp::Popcnt { rd, rm } => {
4004                let mut bytes = Vec::new();
4005
4006                // First, move rm to rd if they're different
4007                if rd != rm {
4008                    let rd_bits = reg_to_bits(rd) as u16;
4009                    let rm_bits = reg_to_bits(rm) as u16;
4010                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
4011                    let d_bit = (rd_bits >> 3) & 1;
4012                    let mov_instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
4013                    bytes.extend_from_slice(&mov_instr.to_le_bytes());
4014                }
4015
4016                // Step 1: x = x - ((x >> 1) & 0x55555555)
4017                // Load 0x55555555 into R12
4018                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x5555)?);
4019                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x5555)?);
4020
4021                // R12_temp = rd >> 1
4022                // We need a second scratch register. Use R11.
4023                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 1)?);
4024
4025                // R11 = R11 & R12 (R11 = (x >> 1) & 0x55555555)
4026                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(11, 11, 12)?);
4027
4028                // rd = rd - R11
4029                bytes.extend_from_slice(&self.encode_thumb32_sub_reg_raw(
4030                    reg_to_bits(rd),
4031                    reg_to_bits(rd),
4032                    11,
4033                )?);
4034
4035                // Step 2: x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
4036                // Load 0x33333333 into R12
4037                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x3333)?);
4038                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x3333)?);
4039
4040                // R11 = rd & R12
4041                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4042                    11,
4043                    reg_to_bits(rd),
4044                    12,
4045                )?);
4046
4047                // rd = rd >> 2
4048                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(
4049                    reg_to_bits(rd),
4050                    reg_to_bits(rd),
4051                    2,
4052                )?);
4053
4054                // rd = rd & R12
4055                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4056                    reg_to_bits(rd),
4057                    reg_to_bits(rd),
4058                    12,
4059                )?);
4060
4061                // rd = rd + R11
4062                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4063                    reg_to_bits(rd),
4064                    reg_to_bits(rd),
4065                    11,
4066                )?);
4067
4068                // Step 3: x = (x + (x >> 4)) & 0x0F0F0F0F
4069                // R11 = rd >> 4
4070                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 4)?);
4071
4072                // rd = rd + R11
4073                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4074                    reg_to_bits(rd),
4075                    reg_to_bits(rd),
4076                    11,
4077                )?);
4078
4079                // Load 0x0F0F0F0F into R12
4080                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x0F0F)?);
4081                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x0F0F)?);
4082
4083                // rd = rd & R12
4084                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4085                    reg_to_bits(rd),
4086                    reg_to_bits(rd),
4087                    12,
4088                )?);
4089
4090                // Step 4: x = x + (x >> 8)
4091                // R11 = rd >> 8
4092                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 8)?);
4093
4094                // rd = rd + R11
4095                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4096                    reg_to_bits(rd),
4097                    reg_to_bits(rd),
4098                    11,
4099                )?);
4100
4101                // Step 5: x = x + (x >> 16)
4102                // R11 = rd >> 16
4103                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 16)?);
4104
4105                // rd = rd + R11
4106                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4107                    reg_to_bits(rd),
4108                    reg_to_bits(rd),
4109                    11,
4110                )?);
4111
4112                // Step 6: return x & 0x3F
4113                // AND with 0x3F (small immediate, can use BIC or AND with immediate)
4114                bytes.extend_from_slice(&self.encode_thumb32_and_imm_raw(
4115                    reg_to_bits(rd),
4116                    reg_to_bits(rd),
4117                    0x3F,
4118                )?);
4119
4120                Ok(bytes)
4121            }
4122
4123            // I64DivU: 64-bit unsigned division using binary long division
4124            // Input: R0:R1 = dividend, R2:R3 = divisor
4125            // Output: R0:R1 = quotient
4126            // Uses: R4-R7, R12 as loop counter (avoid R8 for Renode compatibility)
4127            ArmOp::I64DivU {
4128                rdlo: _,
4129                rdhi: _,
4130                rnlo: _,
4131                rnhi: _,
4132                rmlo: _,
4133                rmhi: _,
4134            } => {
4135                let mut bytes = Vec::new();
4136
4137                // PUSH {R4-R7} - save scratch registers (NO LR — this is inline code)
4138                // 16-bit PUSH: 1011 010 M rrrrrrrr where M=0 (no LR), r=R4-R7 = 0xF0
4139                // Encoding: 1011 0100 1111 0000 = 0xB4F0
4140                bytes.extend_from_slice(&0xB4F0u16.to_le_bytes());
4141
4142                // Initialize quotient (R4:R5) = 0
4143                bytes.extend_from_slice(&0x2400u16.to_le_bytes()); // MOV R4, #0
4144                bytes.extend_from_slice(&0x2500u16.to_le_bytes()); // MOV R5, #0
4145
4146                // Initialize remainder (R6:R7) = 0
4147                bytes.extend_from_slice(&0x2600u16.to_le_bytes()); // MOV R6, #0
4148                bytes.extend_from_slice(&0x2700u16.to_le_bytes()); // MOV R7, #0
4149
4150                // Initialize loop counter R12 = 64 (use R12 scratch instead of R8)
4151                // MOV.W R12, #64: F04F 0C40
4152                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4153                bytes.extend_from_slice(&0x0C40u16.to_le_bytes());
4154
4155                // Loop start
4156                let loop_start = bytes.len();
4157
4158                // === Loop body: process one bit ===
4159
4160                // 1. Shift quotient R4:R5 left by 1
4161                // LSLS R5, R5, #1 (16-bit: 0000 0010 1010 1101 = 0x006D -> actually 0x002D for LSL R5,R5,#1)
4162                // LSL Rd, Rm, #imm5: 000 00 imm5 Rm Rd = 000 00 00001 101 101 = 0x006D
4163                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4164                // Get carry from R4 into R5: ORR R5, R5, R4 LSR #31
4165                // Thumb-2 ORR with shifted register: EA45 75D4 = ORR.W R5, R5, R4, LSR #31
4166                // 11101010 010 S Rn | 0 imm3 Rd imm2 type Rm
4167                // type=01 (LSR), imm5=31 (imm3=111, imm2=11)
4168                bytes.extend_from_slice(&0xEA45u16.to_le_bytes());
4169                bytes.extend_from_slice(&0x75D4u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4170                // LSLS R4, R4, #1: 000 00 00001 100 100 = 0x0064
4171                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4172
4173                // 2. Shift remainder R6:R7 left by 1, OR in MSB of dividend R1
4174                // LSLS R7, R7, #1
4175                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4176                // ORR.W R7, R7, R6, LSR #31
4177                bytes.extend_from_slice(&0xEA47u16.to_le_bytes());
4178                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4179                // LSLS R6, R6, #1
4180                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4181                // ORR.W R6, R6, R1, LSR #31 (bring in MSB of dividend high)
4182                bytes.extend_from_slice(&0xEA46u16.to_le_bytes());
4183                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4184
4185                // 3. Shift dividend R0:R1 left by 1
4186                // LSLS R1, R1, #1
4187                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4188                // ORR.W R1, R1, R0, LSR #31
4189                bytes.extend_from_slice(&0xEA41u16.to_le_bytes());
4190                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4191                // LSLS R0, R0, #1
4192                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4193
4194                // 4. Compare remainder >= divisor (64-bit unsigned comparison)
4195                // Compare high words first: CMP R7, R3
4196                // CMP Rn, Rm encoding: 0x4280 | (Rm << 3) | Rn
4197                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3 (16-bit)
4198                // BHI means R7 > R3 (unsigned) - definitely subtract
4199                // BLO means R7 < R3 - definitely don't subtract
4200                // BEQ means need to check low words
4201
4202                // If high > divisor high: branch to subtract (forward +offset)
4203                // BHI.N +6 (skip CMP, skip BLO, do subtract)
4204                // BHI: 1101 1000 offset8 where cond=1000 (HI)
4205                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4 (to subtract block)
4206
4207                // If high < divisor high: branch past subtract
4208                // BLO.N +10 (skip to decrement)
4209                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BLO/BCC +12 (past subtract)
4210
4211                // High words equal, compare low: CMP R6, R2
4212                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2 (16-bit)
4213                // BLO/BCC past subtract (skip SUBS+SBC.W+ORR.W = 10 bytes = 4 halfwords from PC+4)
4214                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords (past subtract)
4215
4216                // === Subtract block: remainder -= divisor, quotient |= 1 ===
4217                // SUBS R6, R6, R2
4218                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2 (16-bit)
4219                // SBC R7, R7, R3 (with borrow)
4220                // Thumb-2 SBC.W: EB67 0703 = SBC.W R7, R7, R3
4221                bytes.extend_from_slice(&0xEB67u16.to_le_bytes());
4222                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4223                // ORR R4, R4, #1 (set bit 0 of quotient low)
4224                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4225                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4226
4227                // === Decrement counter and loop ===
4228                // SUBS.W R12, R12, #1 (decrement loop counter)
4229                // SUBS.W R12, R12, #1: F1BC 0C01
4230                bytes.extend_from_slice(&0xF1BCu16.to_le_bytes());
4231                bytes.extend_from_slice(&0x0C01u16.to_le_bytes());
4232
4233                // BNE back to loop_start
4234                let branch_offset_bytes = bytes.len() - loop_start + 4; // +4 for pipeline
4235                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4236                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4237                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4238
4239                // === Loop done, move quotient to R0:R1 ===
4240                bytes.extend_from_slice(&0x4620u16.to_le_bytes()); // MOV R0, R4
4241                bytes.extend_from_slice(&0x4629u16.to_le_bytes()); // MOV R1, R5
4242
4243                // POP {R4-R7} - restore scratch registers (NO PC — inline code continues)
4244                // 16-bit POP: 1011 110 P rrrrrrrr where P=0 (no PC), r=R4-R7 = 0xF0
4245                // Encoding: 1011 1100 1111 0000 = 0xBCF0
4246                bytes.extend_from_slice(&0xBCF0u16.to_le_bytes());
4247
4248                Ok(bytes)
4249            }
4250
4251            // I64DivS: 64-bit signed division
4252            // Converts to unsigned, divides, then applies sign
4253            // Input: R0:R1 = dividend (signed), R2:R3 = divisor (signed)
4254            // Output: R0:R1 = quotient (signed)
4255            ArmOp::I64DivS {
4256                rdlo: _,
4257                rdhi: _,
4258                rnlo: _,
4259                rnhi: _,
4260                rmlo: _,
4261                rmhi: _,
4262            } => {
4263                let mut bytes = Vec::new();
4264
4265                // PUSH {R4-R11} - save scratch registers (NO LR — inline code)
4266                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4267                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4268
4269                // Save result sign in R9: R9 = R1 XOR R3 (sign bit = MSB)
4270                // EOR.W R9, R1, R3
4271                bytes.extend_from_slice(&0xEA81u16.to_le_bytes());
4272                bytes.extend_from_slice(&0x0903u16.to_le_bytes());
4273
4274                // If dividend negative (R1 MSB set), negate it
4275                // TST R1, R1 (check sign)
4276                bytes.extend_from_slice(&0x4209u16.to_le_bytes()); // TST R1, R1
4277                // BPL skip_neg_dividend (+10 bytes = 5 halfwords)
4278                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4279
4280                // Negate R0:R1 (64-bit): RSBS R0, R0, #0; SBC R1, R1, R1 LSL #1
4281                // Actually: MVN R0, R0; MVN R1, R1; ADDS R0, R0, #1; ADC R1, R1, #0
4282                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4283                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4284                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4285                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4286                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4287
4288                // If divisor negative (R3 MSB set), negate it
4289                bytes.extend_from_slice(&0x421Bu16.to_le_bytes()); // TST R3, R3
4290                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4291
4292                // Negate R2:R3
4293                bytes.extend_from_slice(&0x43D2u16.to_le_bytes()); // MVNS R2, R2
4294                bytes.extend_from_slice(&0x43DBu16.to_le_bytes()); // MVNS R3, R3
4295                bytes.extend_from_slice(&0x1C52u16.to_le_bytes()); // ADDS R2, R2, #1
4296                bytes.extend_from_slice(&0xF143u16.to_le_bytes()); // ADC.W R3, R3, #0
4297                bytes.extend_from_slice(&0x0300u16.to_le_bytes());
4298
4299                // === Now do unsigned division (same as I64DivU) ===
4300                // Initialize quotient (R4:R5) = 0
4301                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4302                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4303                // Initialize remainder (R6:R7) = 0
4304                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4305                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4306                // Initialize loop counter R8 = 64
4307                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4308                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4309
4310                let loop_start = bytes.len();
4311
4312                // Shift quotient left
4313                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4314                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4315                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4316                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4317
4318                // Shift remainder left, OR in MSB of dividend
4319                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4320                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4321                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4322                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4323                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4324                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4325
4326                // Shift dividend left
4327                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4328                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4329                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4330                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4331
4332                // Compare and conditionally subtract
4333                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4334                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4335                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4336                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4337                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4338
4339                // Subtract and set quotient bit
4340                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4341                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4342                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4343                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4344                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4345
4346                // Decrement and loop
4347                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4348                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4349
4350                let branch_offset_bytes = bytes.len() - loop_start + 4;
4351                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4352                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4353                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4354
4355                // Move quotient to R0:R1
4356                bytes.extend_from_slice(&0x4620u16.to_le_bytes()); // MOV R0, R4
4357                bytes.extend_from_slice(&0x4629u16.to_le_bytes()); // MOV R1, R5
4358
4359                // If result should be negative (R9 MSB set), negate R0:R1
4360                bytes.extend_from_slice(&0xF1B9u16.to_le_bytes()); // TST.W R9, R9 (check MSB)
4361                bytes.extend_from_slice(&0x0F00u16.to_le_bytes());
4362                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8 (skip negation)
4363
4364                // Negate result R0:R1
4365                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4366                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4367                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4368                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4369                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4370
4371                // POP {R4-R11} - restore scratch registers (NO PC — inline code continues)
4372                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4373                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4374
4375                Ok(bytes)
4376            }
4377
4378            // I64RemU: 64-bit unsigned remainder using binary long division
4379            // Same algorithm as I64DivU but returns remainder instead of quotient
4380            // Input: R0:R1 = dividend, R2:R3 = divisor
4381            // Output: R0:R1 = remainder
4382            ArmOp::I64RemU {
4383                rdlo: _,
4384                rdhi: _,
4385                rnlo: _,
4386                rnhi: _,
4387                rmlo: _,
4388                rmhi: _,
4389            } => {
4390                let mut bytes = Vec::new();
4391
4392                // PUSH {R4-R8} - save scratch registers (NO LR — inline code)
4393                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4394                bytes.extend_from_slice(&0x01F0u16.to_le_bytes());
4395
4396                // Initialize quotient (R4:R5) = 0 (computed but not returned)
4397                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4398                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4399                // Initialize remainder (R6:R7) = 0
4400                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4401                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4402                // Initialize loop counter R8 = 64
4403                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4404                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4405
4406                let loop_start = bytes.len();
4407
4408                // Shift quotient left (not needed for result, but keeps algorithm same)
4409                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4410                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4411                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4412                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4413
4414                // Shift remainder left, OR in MSB of dividend
4415                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4416                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4417                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4418                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4419                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4420                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4421
4422                // Shift dividend left
4423                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4424                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4425                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4426                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4427
4428                // Compare and conditionally subtract
4429                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4430                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4431                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4432                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4433                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4434
4435                // Subtract and set quotient bit
4436                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4437                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4438                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4439                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4440                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4441
4442                // Decrement and loop
4443                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4444                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4445
4446                let branch_offset_bytes = bytes.len() - loop_start + 4;
4447                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4448                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4449                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4450
4451                // Move REMAINDER to R0:R1 (difference from I64DivU)
4452                bytes.extend_from_slice(&0x4630u16.to_le_bytes()); // MOV R0, R6
4453                bytes.extend_from_slice(&0x4639u16.to_le_bytes()); // MOV R1, R7
4454
4455                // POP {R4-R8} - restore scratch registers (NO PC — inline code continues)
4456                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4457                bytes.extend_from_slice(&0x01F0u16.to_le_bytes());
4458
4459                Ok(bytes)
4460            }
4461
4462            // I64RemS: 64-bit signed remainder
4463            // Remainder sign follows dividend sign (not quotient rule)
4464            // Input: R0:R1 = dividend (signed), R2:R3 = divisor (signed)
4465            // Output: R0:R1 = remainder (signed, same sign as dividend)
4466            ArmOp::I64RemS {
4467                rdlo: _,
4468                rdhi: _,
4469                rnlo: _,
4470                rnhi: _,
4471                rmlo: _,
4472                rmhi: _,
4473            } => {
4474                let mut bytes = Vec::new();
4475
4476                // PUSH {R4-R11} - save scratch registers (NO LR — inline code)
4477                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4478                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4479
4480                // Save dividend sign in R9 (remainder sign = dividend sign)
4481                // MOV R9, R1 (just need the sign bit)
4482                bytes.extend_from_slice(&0x4689u16.to_le_bytes()); // MOV R9, R1
4483
4484                // If dividend negative (R1 MSB set), negate it
4485                bytes.extend_from_slice(&0x4209u16.to_le_bytes()); // TST R1, R1
4486                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4487
4488                // Negate R0:R1
4489                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4490                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4491                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4492                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4493                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4494
4495                // If divisor negative (R3 MSB set), negate it
4496                bytes.extend_from_slice(&0x421Bu16.to_le_bytes()); // TST R3, R3
4497                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4498
4499                // Negate R2:R3
4500                bytes.extend_from_slice(&0x43D2u16.to_le_bytes()); // MVNS R2, R2
4501                bytes.extend_from_slice(&0x43DBu16.to_le_bytes()); // MVNS R3, R3
4502                bytes.extend_from_slice(&0x1C52u16.to_le_bytes()); // ADDS R2, R2, #1
4503                bytes.extend_from_slice(&0xF143u16.to_le_bytes()); // ADC.W R3, R3, #0
4504                bytes.extend_from_slice(&0x0300u16.to_le_bytes());
4505
4506                // === Unsigned division algorithm ===
4507                // Initialize quotient (R4:R5) = 0
4508                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4509                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4510                // Initialize remainder (R6:R7) = 0
4511                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4512                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4513                // Initialize loop counter R8 = 64
4514                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4515                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4516
4517                let loop_start = bytes.len();
4518
4519                // Shift quotient left
4520                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4521                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4522                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4523                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4524
4525                // Shift remainder left, OR in MSB of dividend
4526                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4527                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4528                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4529                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4530                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4531                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4532
4533                // Shift dividend left
4534                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4535                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4536                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4537                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4538
4539                // Compare and conditionally subtract
4540                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4541                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4542                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4543                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4544                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4545
4546                // Subtract and set quotient bit
4547                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4548                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4549                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4550                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4551                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4552
4553                // Decrement and loop
4554                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4555                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4556
4557                let branch_offset_bytes = bytes.len() - loop_start + 4;
4558                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4559                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4560                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4561
4562                // Move remainder to R0:R1
4563                bytes.extend_from_slice(&0x4630u16.to_le_bytes()); // MOV R0, R6
4564                bytes.extend_from_slice(&0x4639u16.to_le_bytes()); // MOV R1, R7
4565
4566                // If original dividend was negative (R9 MSB set), negate remainder
4567                bytes.extend_from_slice(&0xF1B9u16.to_le_bytes()); // TST.W R9, R9
4568                bytes.extend_from_slice(&0x0F00u16.to_le_bytes());
4569                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4570
4571                // Negate result R0:R1
4572                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4573                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4574                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4575                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4576                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4577
4578                // POP {R4-R11} - restore scratch registers (NO PC — inline code continues)
4579                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4580                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4581
4582                Ok(bytes)
4583            }
4584
4585            // === F32 VFP single-precision Thumb-2 encodings ===
4586            // VFP instruction words are identical to ARM32; emit as two LE halfwords.
4587            ArmOp::F32Add { sd, sn, sm } => {
4588                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE300A00, sd, sn, sm)?))
4589            }
4590            ArmOp::F32Sub { sd, sn, sm } => {
4591                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE300A40, sd, sn, sm)?))
4592            }
4593            ArmOp::F32Mul { sd, sn, sm } => {
4594                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE200A00, sd, sn, sm)?))
4595            }
4596            ArmOp::F32Div { sd, sn, sm } => {
4597                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE800A00, sd, sn, sm)?))
4598            }
4599            ArmOp::F32Abs { sd, sm } => {
4600                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB00AC0, sd, sm)?))
4601            }
4602            ArmOp::F32Neg { sd, sm } => {
4603                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB10A40, sd, sm)?))
4604            }
4605            ArmOp::F32Sqrt { sd, sm } => {
4606                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB10AC0, sd, sm)?))
4607            }
4608
4609            // f32 pseudo-ops — multi-instruction sequences
4610            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
4611            ArmOp::F32Ceil { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b01),
4612            ArmOp::F32Floor { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b10),
4613            ArmOp::F32Trunc { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b11),
4614            ArmOp::F32Nearest { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b00),
4615            ArmOp::F32Min { sd, sn, sm } => self.encode_thumb_f32_minmax(sd, sn, sm, true),
4616            ArmOp::F32Max { sd, sn, sm } => self.encode_thumb_f32_minmax(sd, sn, sm, false),
4617            ArmOp::F32Copysign { sd, sn, sm } => self.encode_thumb_f32_copysign(sd, sn, sm),
4618
4619            // f32 comparisons — VCMP + VMRS + MOV #0 + IT + MOV #1
4620            ArmOp::F32Eq { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x0),
4621            ArmOp::F32Ne { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x1),
4622            ArmOp::F32Lt { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x4),
4623            ArmOp::F32Le { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x9),
4624            ArmOp::F32Gt { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0xC),
4625            ArmOp::F32Ge { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0xA),
4626
4627            ArmOp::F32Const { sd, value } => self.encode_thumb_f32_const(sd, *value),
4628
4629            ArmOp::F32Load { sd, addr } => {
4630                Ok(vfp_to_thumb_bytes(encode_vfp_ldst(0xED900A00, sd, addr)?))
4631            }
4632            ArmOp::F32Store { sd, addr } => {
4633                Ok(vfp_to_thumb_bytes(encode_vfp_ldst(0xED800A00, sd, addr)?))
4634            }
4635
4636            ArmOp::F32ConvertI32S { sd, rm } => self.encode_thumb_f32_convert_i32(sd, rm, true),
4637            ArmOp::F32ConvertI32U { sd, rm } => self.encode_thumb_f32_convert_i32(sd, rm, false),
4638            ArmOp::F32ConvertI64S { .. } | ArmOp::F32ConvertI64U { .. } => {
4639                Err(synth_core::Error::synthesis(
4640                    "F32 i64 conversion not supported (requires register pairs on 32-bit ARM)",
4641                ))
4642            }
4643            ArmOp::F32ReinterpretI32 { sd, rm } => {
4644                Ok(vfp_to_thumb_bytes(encode_vmov_core_sreg(true, sd, rm)?))
4645            }
4646            ArmOp::I32ReinterpretF32 { rd, sm } => {
4647                Ok(vfp_to_thumb_bytes(encode_vmov_core_sreg(false, sm, rd)?))
4648            }
4649            ArmOp::I32TruncF32S { rd, sm } => self.encode_thumb_i32_trunc_f32(rd, sm, true),
4650            ArmOp::I32TruncF32U { rd, sm } => self.encode_thumb_i32_trunc_f32(rd, sm, false),
4651
4652            // === F64 VFP double-precision Thumb-2 encodings ===
4653            // VFP instruction words are identical to ARM32; emit as two LE halfwords.
4654            ArmOp::F64Add { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4655                0xEE300B00, dd, dn, dm,
4656            )?)),
4657            ArmOp::F64Sub { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4658                0xEE300B40, dd, dn, dm,
4659            )?)),
4660            ArmOp::F64Mul { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4661                0xEE200B00, dd, dn, dm,
4662            )?)),
4663            ArmOp::F64Div { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4664                0xEE800B00, dd, dn, dm,
4665            )?)),
4666            ArmOp::F64Abs { dd, dm } => {
4667                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB00BC0, dd, dm)?))
4668            }
4669            ArmOp::F64Neg { dd, dm } => {
4670                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB10B40, dd, dm)?))
4671            }
4672            ArmOp::F64Sqrt { dd, dm } => {
4673                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB10BC0, dd, dm)?))
4674            }
4675
4676            // f64 pseudo-ops
4677            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
4678            ArmOp::F64Ceil { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b01),
4679            ArmOp::F64Floor { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b10),
4680            ArmOp::F64Trunc { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b11),
4681            ArmOp::F64Nearest { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b00),
4682            ArmOp::F64Min { dd, dn, dm } => self.encode_thumb_f64_minmax(dd, dn, dm, true),
4683            ArmOp::F64Max { dd, dn, dm } => self.encode_thumb_f64_minmax(dd, dn, dm, false),
4684            ArmOp::F64Copysign { dd, dn, dm } => self.encode_thumb_f64_copysign(dd, dn, dm),
4685
4686            // f64 comparisons
4687            ArmOp::F64Eq { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x0),
4688            ArmOp::F64Ne { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x1),
4689            ArmOp::F64Lt { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x4),
4690            ArmOp::F64Le { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x9),
4691            ArmOp::F64Gt { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0xC),
4692            ArmOp::F64Ge { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0xA),
4693
4694            ArmOp::F64Const { dd, value } => self.encode_thumb_f64_const(dd, *value),
4695
4696            ArmOp::F64Load { dd, addr } => Ok(vfp_to_thumb_bytes(encode_vfp_ldst_f64(
4697                0xED900B00, dd, addr,
4698            )?)),
4699            ArmOp::F64Store { dd, addr } => Ok(vfp_to_thumb_bytes(encode_vfp_ldst_f64(
4700                0xED800B00, dd, addr,
4701            )?)),
4702
4703            ArmOp::F64ConvertI32S { dd, rm } => self.encode_thumb_f64_convert_i32(dd, rm, true),
4704            ArmOp::F64ConvertI32U { dd, rm } => self.encode_thumb_f64_convert_i32(dd, rm, false),
4705            ArmOp::F64ConvertI64S { .. } | ArmOp::F64ConvertI64U { .. } => {
4706                Err(synth_core::Error::synthesis(
4707                    "F64 i64 conversion not supported (requires register pairs on 32-bit ARM)",
4708                ))
4709            }
4710            ArmOp::F64PromoteF32 { dd, sm } => self.encode_thumb_f64_promote_f32(dd, sm),
4711            ArmOp::F64ReinterpretI64 { dd, rmlo, rmhi } => Ok(vfp_to_thumb_bytes(
4712                encode_vmov_core_dreg(true, dd, rmlo, rmhi)?,
4713            )),
4714            ArmOp::I64ReinterpretF64 { rdlo, rdhi, dm } => Ok(vfp_to_thumb_bytes(
4715                encode_vmov_core_dreg(false, dm, rdlo, rdhi)?,
4716            )),
4717            ArmOp::I64TruncF64S { .. } | ArmOp::I64TruncF64U { .. } => {
4718                Err(synth_core::Error::synthesis(
4719                    "i64 truncation from F64 not supported (requires i64 register pairs on 32-bit ARM)",
4720                ))
4721            }
4722            ArmOp::I32TruncF64S { rd, dm } => self.encode_thumb_i32_trunc_f64(rd, dm, true),
4723            ArmOp::I32TruncF64U { rd, dm } => self.encode_thumb_i32_trunc_f64(rd, dm, false),
4724
4725            // ===== i64 operations: encode as multi-instruction Thumb-2 sequences =====
4726
4727            // I64Add: ADDS rdlo, rnlo, rmlo; ADC.W rdhi, rnhi, rmhi
4728            ArmOp::I64Add {
4729                rdlo,
4730                rdhi,
4731                rnlo,
4732                rnhi,
4733                rmlo,
4734                rmhi,
4735            } => {
4736                let mut bytes = Vec::new();
4737                // ADDS rdlo, rnlo, rmlo (16-bit)
4738                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Adds {
4739                    rd: *rdlo,
4740                    rn: *rnlo,
4741                    op2: Operand2::Reg(*rmlo),
4742                })?);
4743                // ADC.W rdhi, rnhi, rmhi (32-bit)
4744                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Adc {
4745                    rd: *rdhi,
4746                    rn: *rnhi,
4747                    op2: Operand2::Reg(*rmhi),
4748                })?);
4749                Ok(bytes)
4750            }
4751
4752            // I64Sub: SUBS rdlo, rnlo, rmlo; SBC.W rdhi, rnhi, rmhi
4753            ArmOp::I64Sub {
4754                rdlo,
4755                rdhi,
4756                rnlo,
4757                rnhi,
4758                rmlo,
4759                rmhi,
4760            } => {
4761                let mut bytes = Vec::new();
4762                // SUBS rdlo, rnlo, rmlo (16-bit)
4763                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Subs {
4764                    rd: *rdlo,
4765                    rn: *rnlo,
4766                    op2: Operand2::Reg(*rmlo),
4767                })?);
4768                // SBC.W rdhi, rnhi, rmhi (32-bit)
4769                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Sbc {
4770                    rd: *rdhi,
4771                    rn: *rnhi,
4772                    op2: Operand2::Reg(*rmhi),
4773                })?);
4774                Ok(bytes)
4775            }
4776
4777            // I64And: AND rdlo, rnlo, rmlo; AND rdhi, rnhi, rmhi
4778            ArmOp::I64And {
4779                rdlo,
4780                rdhi,
4781                rnlo,
4782                rnhi,
4783                rmlo,
4784                rmhi,
4785            } => {
4786                let mut bytes = Vec::new();
4787                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::And {
4788                    rd: *rdlo,
4789                    rn: *rnlo,
4790                    op2: Operand2::Reg(*rmlo),
4791                })?);
4792                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::And {
4793                    rd: *rdhi,
4794                    rn: *rnhi,
4795                    op2: Operand2::Reg(*rmhi),
4796                })?);
4797                Ok(bytes)
4798            }
4799
4800            // I64Or: ORR rdlo, rnlo, rmlo; ORR rdhi, rnhi, rmhi
4801            ArmOp::I64Or {
4802                rdlo,
4803                rdhi,
4804                rnlo,
4805                rnhi,
4806                rmlo,
4807                rmhi,
4808            } => {
4809                let mut bytes = Vec::new();
4810                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Orr {
4811                    rd: *rdlo,
4812                    rn: *rnlo,
4813                    op2: Operand2::Reg(*rmlo),
4814                })?);
4815                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Orr {
4816                    rd: *rdhi,
4817                    rn: *rnhi,
4818                    op2: Operand2::Reg(*rmhi),
4819                })?);
4820                Ok(bytes)
4821            }
4822
4823            // I64Xor: EOR rdlo, rnlo, rmlo; EOR rdhi, rnhi, rmhi
4824            ArmOp::I64Xor {
4825                rdlo,
4826                rdhi,
4827                rnlo,
4828                rnhi,
4829                rmlo,
4830                rmhi,
4831            } => {
4832                let mut bytes = Vec::new();
4833                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Eor {
4834                    rd: *rdlo,
4835                    rn: *rnlo,
4836                    op2: Operand2::Reg(*rmlo),
4837                })?);
4838                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Eor {
4839                    rd: *rdhi,
4840                    rn: *rnhi,
4841                    op2: Operand2::Reg(*rmhi),
4842                })?);
4843                Ok(bytes)
4844            }
4845
4846            // I64Eqz: ORR scratch, lo, hi; ITE EQ; MOV rd, #1; MOV rd, #0
4847            ArmOp::I64Eqz { rd, rnlo, rnhi } => self.encode_thumb(&ArmOp::I64SetCondZ {
4848                rd: *rd,
4849                rn_lo: *rnlo,
4850                rn_hi: *rnhi,
4851            }),
4852
4853            // I64 comparisons: delegate to I64SetCond
4854            ArmOp::I64Eq {
4855                rd,
4856                rnlo,
4857                rnhi,
4858                rmlo,
4859                rmhi,
4860            } => self.encode_thumb(&ArmOp::I64SetCond {
4861                rd: *rd,
4862                rn_lo: *rnlo,
4863                rn_hi: *rnhi,
4864                rm_lo: *rmlo,
4865                rm_hi: *rmhi,
4866                cond: synth_synthesis::Condition::EQ,
4867            }),
4868
4869            ArmOp::I64Ne {
4870                rd,
4871                rnlo,
4872                rnhi,
4873                rmlo,
4874                rmhi,
4875            } => self.encode_thumb(&ArmOp::I64SetCond {
4876                rd: *rd,
4877                rn_lo: *rnlo,
4878                rn_hi: *rnhi,
4879                rm_lo: *rmlo,
4880                rm_hi: *rmhi,
4881                cond: synth_synthesis::Condition::NE,
4882            }),
4883
4884            ArmOp::I64LtS {
4885                rd,
4886                rnlo,
4887                rnhi,
4888                rmlo,
4889                rmhi,
4890            } => self.encode_thumb(&ArmOp::I64SetCond {
4891                rd: *rd,
4892                rn_lo: *rnlo,
4893                rn_hi: *rnhi,
4894                rm_lo: *rmlo,
4895                rm_hi: *rmhi,
4896                cond: synth_synthesis::Condition::LT,
4897            }),
4898
4899            ArmOp::I64LtU {
4900                rd,
4901                rnlo,
4902                rnhi,
4903                rmlo,
4904                rmhi,
4905            } => self.encode_thumb(&ArmOp::I64SetCond {
4906                rd: *rd,
4907                rn_lo: *rnlo,
4908                rn_hi: *rnhi,
4909                rm_lo: *rmlo,
4910                rm_hi: *rmhi,
4911                cond: synth_synthesis::Condition::LO,
4912            }),
4913
4914            ArmOp::I64LeS {
4915                rd,
4916                rnlo,
4917                rnhi,
4918                rmlo,
4919                rmhi,
4920            } => self.encode_thumb(&ArmOp::I64SetCond {
4921                rd: *rd,
4922                rn_lo: *rnlo,
4923                rn_hi: *rnhi,
4924                rm_lo: *rmlo,
4925                rm_hi: *rmhi,
4926                cond: synth_synthesis::Condition::LE,
4927            }),
4928
4929            ArmOp::I64LeU {
4930                rd,
4931                rnlo,
4932                rnhi,
4933                rmlo,
4934                rmhi,
4935            } => self.encode_thumb(&ArmOp::I64SetCond {
4936                rd: *rd,
4937                rn_lo: *rnlo,
4938                rn_hi: *rnhi,
4939                rm_lo: *rmlo,
4940                rm_hi: *rmhi,
4941                cond: synth_synthesis::Condition::LS,
4942            }),
4943
4944            ArmOp::I64GtS {
4945                rd,
4946                rnlo,
4947                rnhi,
4948                rmlo,
4949                rmhi,
4950            } => self.encode_thumb(&ArmOp::I64SetCond {
4951                rd: *rd,
4952                rn_lo: *rnlo,
4953                rn_hi: *rnhi,
4954                rm_lo: *rmlo,
4955                rm_hi: *rmhi,
4956                cond: synth_synthesis::Condition::GT,
4957            }),
4958
4959            ArmOp::I64GtU {
4960                rd,
4961                rnlo,
4962                rnhi,
4963                rmlo,
4964                rmhi,
4965            } => self.encode_thumb(&ArmOp::I64SetCond {
4966                rd: *rd,
4967                rn_lo: *rnlo,
4968                rn_hi: *rnhi,
4969                rm_lo: *rmlo,
4970                rm_hi: *rmhi,
4971                cond: synth_synthesis::Condition::HI,
4972            }),
4973
4974            ArmOp::I64GeS {
4975                rd,
4976                rnlo,
4977                rnhi,
4978                rmlo,
4979                rmhi,
4980            } => self.encode_thumb(&ArmOp::I64SetCond {
4981                rd: *rd,
4982                rn_lo: *rnlo,
4983                rn_hi: *rnhi,
4984                rm_lo: *rmlo,
4985                rm_hi: *rmhi,
4986                cond: synth_synthesis::Condition::GE,
4987            }),
4988
4989            ArmOp::I64GeU {
4990                rd,
4991                rnlo,
4992                rnhi,
4993                rmlo,
4994                rmhi,
4995            } => self.encode_thumb(&ArmOp::I64SetCond {
4996                rd: *rd,
4997                rn_lo: *rnlo,
4998                rn_hi: *rnhi,
4999                rm_lo: *rmlo,
5000                rm_hi: *rmhi,
5001                cond: synth_synthesis::Condition::HS,
5002            }),
5003
5004            // I64Const: MOVW rdlo, lo16; MOVT rdlo, hi16; MOVW rdhi, lo16_hi; MOVT rdhi, hi16_hi
5005            ArmOp::I64Const { rdlo, rdhi, value } => {
5006                let lo32 = *value as u32;
5007                let hi32 = (*value >> 32) as u32;
5008                let mut bytes = Vec::new();
5009                // Load low 32 bits into rdlo
5010                bytes.extend_from_slice(
5011                    &self.encode_thumb32_movw_raw(reg_to_bits(rdlo), lo32 & 0xFFFF)?,
5012                );
5013                if lo32 > 0xFFFF {
5014                    bytes.extend_from_slice(
5015                        &self.encode_thumb32_movt_raw(reg_to_bits(rdlo), lo32 >> 16)?,
5016                    );
5017                }
5018                // Load high 32 bits into rdhi
5019                bytes.extend_from_slice(
5020                    &self.encode_thumb32_movw_raw(reg_to_bits(rdhi), hi32 & 0xFFFF)?,
5021                );
5022                if hi32 > 0xFFFF {
5023                    bytes.extend_from_slice(
5024                        &self.encode_thumb32_movt_raw(reg_to_bits(rdhi), hi32 >> 16)?,
5025                    );
5026                }
5027                Ok(bytes)
5028            }
5029
5030            // I64Ldr: LDR rdlo, [base, offset]; LDR rdhi, [base, offset+4]
5031            ArmOp::I64Ldr { rdlo, rdhi, addr } => {
5032                let mut bytes = Vec::new();
5033                let offset = if addr.offset < 0 {
5034                    0u32
5035                } else {
5036                    addr.offset as u32
5037                };
5038                bytes.extend_from_slice(&self.encode_thumb32_ldr(rdlo, &addr.base, offset)?);
5039                bytes.extend_from_slice(&self.encode_thumb32_ldr(
5040                    rdhi,
5041                    &addr.base,
5042                    offset.wrapping_add(4),
5043                )?);
5044                Ok(bytes)
5045            }
5046
5047            // I64Str: STR rdlo, [base, offset]; STR rdhi, [base, offset+4]
5048            ArmOp::I64Str { rdlo, rdhi, addr } => {
5049                let mut bytes = Vec::new();
5050                let offset = if addr.offset < 0 {
5051                    0u32
5052                } else {
5053                    addr.offset as u32
5054                };
5055                bytes.extend_from_slice(&self.encode_thumb32_str(rdlo, &addr.base, offset)?);
5056                bytes.extend_from_slice(&self.encode_thumb32_str(
5057                    rdhi,
5058                    &addr.base,
5059                    offset.wrapping_add(4),
5060                )?);
5061                Ok(bytes)
5062            }
5063
5064            // I64ExtendI32S: MOV rdlo, rn; ASR rdhi, rdlo, #31 (sign-extend)
5065            ArmOp::I64ExtendI32S { rdlo, rdhi, rn } => {
5066                let mut bytes = Vec::new();
5067                if rdlo != rn {
5068                    // MOV rdlo, rn (16-bit)
5069                    bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Mov {
5070                        rd: *rdlo,
5071                        op2: Operand2::Reg(*rn),
5072                    })?);
5073                }
5074                // ASR rdhi, rdlo, #31 (sign-extend: fill high word with sign bit)
5075                bytes.extend_from_slice(
5076                    &self.encode_thumb32_shift(rdhi, rdlo, 31, 0b10)?, // ASR type
5077                );
5078                Ok(bytes)
5079            }
5080
5081            // I64ExtendI32U: MOV rdlo, rn; MOV rdhi, #0
5082            ArmOp::I64ExtendI32U { rdlo, rdhi, rn } => {
5083                let mut bytes = Vec::new();
5084                if rdlo != rn {
5085                    // MOV rdlo, rn
5086                    bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Mov {
5087                        rd: *rdlo,
5088                        op2: Operand2::Reg(*rn),
5089                    })?);
5090                }
5091                // MOV rdhi, #0 (16-bit: MOVS Rd, #0)
5092                let rdhi_bits = reg_to_bits(rdhi) as u16;
5093                let instr: u16 = 0x2000 | (rdhi_bits << 8);
5094                bytes.extend_from_slice(&instr.to_le_bytes());
5095                Ok(bytes)
5096            }
5097
5098            // I32WrapI64: MOV rd, rnlo (just take low 32 bits)
5099            ArmOp::I32WrapI64 { rd, rnlo } => {
5100                if rd == rnlo {
5101                    // No-op: already in the right register
5102                    let instr: u16 = 0xBF00; // NOP
5103                    Ok(instr.to_le_bytes().to_vec())
5104                } else {
5105                    // MOV rd, rnlo
5106                    self.encode_thumb(&ArmOp::Mov {
5107                        rd: *rd,
5108                        op2: Operand2::Reg(*rnlo),
5109                    })
5110                }
5111            }
5112
5113            // ===== Helium MVE operations (Thumb-2 encoding) =====
5114            ArmOp::MveLoad { qd, addr } => Ok(vfp_to_thumb_bytes(encode_mve_vldrw(qd, addr))),
5115            ArmOp::MveStore { qd, addr } => Ok(vfp_to_thumb_bytes(encode_mve_vstrw(qd, addr))),
5116            ArmOp::MveConst { qd, bytes } => self.encode_thumb_mve_const(qd, bytes),
5117            ArmOp::MveAnd { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5118                0xEF000150, qd, qn, qm,
5119            ))),
5120            ArmOp::MveOrr { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5121                0xEF200150, qd, qn, qm,
5122            ))),
5123            ArmOp::MveEor { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5124                0xFF000150, qd, qn, qm,
5125            ))),
5126            ArmOp::MveMvn { qd, qm } => {
5127                // VMVN Qd, Qm: 0xFFB005C0 | Qd<<12 | Qm
5128                let qd_enc = qreg_to_num(qd);
5129                let qm_enc = qreg_to_num(qm);
5130                let instr: u32 = 0xFFB005C0 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5131                Ok(vfp_to_thumb_bytes(instr))
5132            }
5133            ArmOp::MveBic { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5134                0xEF100150, qd, qn, qm,
5135            ))),
5136            ArmOp::MveAddI { qd, qn, qm, size } => {
5137                let sz = mve_size_bits(size);
5138                let base: u32 = 0xEF000840 | (sz << 20);
5139                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5140            }
5141            ArmOp::MveSubI { qd, qn, qm, size } => {
5142                let sz = mve_size_bits(size);
5143                let base: u32 = 0xFF000840 | (sz << 20);
5144                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5145            }
5146            ArmOp::MveMulI { qd, qn, qm, size } => {
5147                let sz = mve_size_bits(size);
5148                let base: u32 = 0xEF000950 | (sz << 20);
5149                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5150            }
5151            ArmOp::MveNegI { qd, qm, size } => {
5152                let sz = mve_size_bits(size);
5153                // VNEG.Sx Qd, Qm
5154                let qd_enc = qreg_to_num(qd);
5155                let qm_enc = qreg_to_num(qm);
5156                let base: u32 = 0xFFB103C0 | (sz << 18);
5157                let instr = base | ((qd_enc * 2) << 12) | (qm_enc * 2);
5158                Ok(vfp_to_thumb_bytes(instr))
5159            }
5160            ArmOp::MveDup { qd, rn, size } => {
5161                let sz = mve_size_bits(size);
5162                let qd_enc = qreg_to_num(qd);
5163                let rn_bits = reg_to_bits(rn);
5164                // VDUP.sz Qd, Rn: EEA0 0B10 variant
5165                // size encoding: 00=32, 01=16, 10=8
5166                let be = match sz {
5167                    0 => 0b00u32, // 8-bit
5168                    1 => 0b01,    // 16-bit
5169                    _ => 0b00,    // 32-bit (default)
5170                };
5171                let instr: u32 = 0xEEA00B10 | ((qd_enc * 2) << 16) | (rn_bits << 12) | (be << 5);
5172                Ok(vfp_to_thumb_bytes(instr))
5173            }
5174            ArmOp::MveExtractLane { rd, qn, lane, size } => {
5175                let qn_enc = qreg_to_num(qn);
5176                let rd_bits = reg_to_bits(rd);
5177                // VMOV.sz Rd, Dn[x] — extract from Q-register lane
5178                // For 32-bit: VMOV Rd, Dn — where Dn is the appropriate D-register
5179                let d_reg = qn_enc * 2 + ((*lane as u32) >> 1);
5180                let lane_in_d = (*lane as u32) & 1;
5181                let _sz = mve_size_bits(size);
5182                // VMOV Rd, Dn[x]: EE10 0B10 for 32-bit
5183                let instr: u32 = 0xEE100B10 | (d_reg << 16) | (rd_bits << 12) | (lane_in_d << 21);
5184                Ok(vfp_to_thumb_bytes(instr))
5185            }
5186            ArmOp::MveInsertLane { qd, rn, lane, size } => {
5187                let qd_enc = qreg_to_num(qd);
5188                let rn_bits = reg_to_bits(rn);
5189                let d_reg = qd_enc * 2 + ((*lane as u32) >> 1);
5190                let lane_in_d = (*lane as u32) & 1;
5191                let _sz = mve_size_bits(size);
5192                // VMOV Dn[x], Rn: EE00 0B10 for 32-bit
5193                let instr: u32 = 0xEE000B10 | (d_reg << 16) | (rn_bits << 12) | (lane_in_d << 21);
5194                Ok(vfp_to_thumb_bytes(instr))
5195            }
5196
5197            // MVE float comparisons — emit VCMP + VPSEL sequence (simplified: just VCMP)
5198            ArmOp::MveCmpEqI { qd, qn, qm, size }
5199            | ArmOp::MveCmpNeI { qd, qn, qm, size }
5200            | ArmOp::MveCmpLtS { qd, qn, qm, size }
5201            | ArmOp::MveCmpLtU { qd, qn, qm, size }
5202            | ArmOp::MveCmpGtS { qd, qn, qm, size }
5203            | ArmOp::MveCmpGtU { qd, qn, qm, size }
5204            | ArmOp::MveCmpLeS { qd, qn, qm, size }
5205            | ArmOp::MveCmpLeU { qd, qn, qm, size }
5206            | ArmOp::MveCmpGeS { qd, qn, qm, size }
5207            | ArmOp::MveCmpGeU { qd, qn, qm, size } => {
5208                // Encode as VADD (placeholder encoding — real implementation
5209                // would use VCMP + VPSEL pair)
5210                let sz = mve_size_bits(size);
5211                let base: u32 = 0xEF000840 | (sz << 20);
5212                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5213            }
5214
5215            // f32x4 MVE arithmetic
5216            ArmOp::MveAddF32 { qd, qn, qm } => {
5217                // VADD.F32 Qd, Qn, Qm (MVE): 0xEF000D40
5218                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF000D40, qd, qn, qm)))
5219            }
5220            ArmOp::MveSubF32 { qd, qn, qm } => {
5221                // VSUB.F32 Qd, Qn, Qm (MVE): 0xEF200D40
5222                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF200D40, qd, qn, qm)))
5223            }
5224            ArmOp::MveMulF32 { qd, qn, qm } => {
5225                // VMUL.F32 Qd, Qn, Qm (MVE): 0xFF000D50
5226                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xFF000D50, qd, qn, qm)))
5227            }
5228            ArmOp::MveNegF32 { qd, qm } => {
5229                let qd_enc = qreg_to_num(qd);
5230                let qm_enc = qreg_to_num(qm);
5231                // VNEG.F32 Qd, Qm: FFB907C0
5232                let instr: u32 = 0xFFB907C0 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5233                Ok(vfp_to_thumb_bytes(instr))
5234            }
5235            ArmOp::MveAbsF32 { qd, qm } => {
5236                let qd_enc = qreg_to_num(qd);
5237                let qm_enc = qreg_to_num(qm);
5238                // VABS.F32 Qd, Qm: FFB90740
5239                let instr: u32 = 0xFFB90740 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5240                Ok(vfp_to_thumb_bytes(instr))
5241            }
5242            ArmOp::MveCmpEqF32 { qd, qn, qm }
5243            | ArmOp::MveCmpNeF32 { qd, qn, qm }
5244            | ArmOp::MveCmpLtF32 { qd, qn, qm }
5245            | ArmOp::MveCmpLeF32 { qd, qn, qm }
5246            | ArmOp::MveCmpGtF32 { qd, qn, qm }
5247            | ArmOp::MveCmpGeF32 { qd, qn, qm } => {
5248                // Placeholder: encode as VADD.F32 (real impl needs VCMP.F32 + VPSEL)
5249                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF000D40, qd, qn, qm)))
5250            }
5251            ArmOp::MveDupF32 { qd, rn } => {
5252                let qd_enc = qreg_to_num(qd);
5253                let rn_bits = reg_to_bits(rn);
5254                // VDUP.32 Qd, Rn (same encoding as integer VDUP.32)
5255                let instr: u32 = 0xEEA00B10 | ((qd_enc * 2) << 16) | (rn_bits << 12);
5256                Ok(vfp_to_thumb_bytes(instr))
5257            }
5258            ArmOp::MveExtractLaneF32 { rd, qn, lane } => {
5259                let qn_enc = qreg_to_num(qn);
5260                let rd_bits = reg_to_bits(rd);
5261                // VMOV Rd, Sn where Sn = Q*4 + lane
5262                let s_num = qn_enc * 4 + (*lane as u32);
5263                let (vn, n) = encode_sreg(s_num);
5264                let instr: u32 = 0xEE100A10 | (vn << 16) | (rd_bits << 12) | (n << 7);
5265                Ok(vfp_to_thumb_bytes(instr))
5266            }
5267            ArmOp::MveReplaceLaneF32 { qd, rn, lane } => {
5268                let qd_enc = qreg_to_num(qd);
5269                let rn_bits = reg_to_bits(rn);
5270                // VMOV Sn, Rn where Sn = Q*4 + lane
5271                let s_num = qd_enc * 4 + (*lane as u32);
5272                let (vn, n) = encode_sreg(s_num);
5273                let instr: u32 = 0xEE000A10 | (vn << 16) | (rn_bits << 12) | (n << 7);
5274                Ok(vfp_to_thumb_bytes(instr))
5275            }
5276            ArmOp::MveDivF32 { qd, qn, qm } => {
5277                // Lane-wise: extract 4 S-regs, VDIV, insert back
5278                self.encode_thumb_mve_lane_wise_f32_binop(qd, qn, qm, 0xEE800A00)
5279            }
5280            ArmOp::MveSqrtF32 { qd, qm } => {
5281                // Lane-wise: extract 4 S-regs, VSQRT, insert back
5282                self.encode_thumb_mve_lane_wise_f32_sqrt(qd, qm)
5283            }
5284
5285            // Catch-all for any remaining ops
5286            _ => {
5287                let instr: u16 = 0xBF00; // NOP
5288                Ok(instr.to_le_bytes().to_vec())
5289            }
5290        }
5291    }
5292
5293    // === Thumb-2 VFP multi-instruction helpers ===
5294
5295    /// Encode F32 comparison as Thumb-2: VCMP.F32 + VMRS + MOVS rd,#0 + IT + MOV rd,#1
5296    fn encode_thumb_f32_compare(
5297        &self,
5298        rd: &Reg,
5299        sn: &VfpReg,
5300        sm: &VfpReg,
5301        cond_code: u32,
5302    ) -> Result<Vec<u8>> {
5303        let mut bytes = Vec::new();
5304        let rd_bits = reg_to_bits(rd);
5305
5306        // VCMP.F32 Sn, Sm
5307        let sn_num = vfp_sreg_to_num(sn)?;
5308        let sm_num = vfp_sreg_to_num(sm)?;
5309        let (vd, d) = encode_sreg(sn_num);
5310        let (vm, m) = encode_sreg(sm_num);
5311        let vcmp = 0xEEB40A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5312        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5313
5314        // VMRS APSR_nzcv, FPSCR: 0xEEF1FA10
5315        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5316
5317        // MOVS Rd, #0 (16-bit): 0010 0 Rd(3) 0000 0000
5318        if rd_bits < 8 {
5319            let movs_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
5320            bytes.extend_from_slice(&movs_zero.to_le_bytes());
5321        } else {
5322            // MOV.W Rd, #0 (32-bit Thumb-2)
5323            let hw1: u16 = 0xF04F;
5324            let hw2: u16 = (rd_bits as u16) << 8;
5325            bytes.extend_from_slice(&hw1.to_le_bytes());
5326            bytes.extend_from_slice(&hw2.to_le_bytes());
5327        }
5328
5329        // IT<cond> — If-Then for conditional MOV
5330        // IT encoding: 1011 1111 cond(4) mask(4)
5331        // mask = 0x8 for single "then" (IT)
5332        let it: u16 = 0xBF00 | ((cond_code as u16) << 4) | 0x8;
5333        bytes.extend_from_slice(&it.to_le_bytes());
5334
5335        // MOV Rd, #1 (16-bit, conditional due to IT): 0010 0 Rd(3) 0000 0001
5336        if rd_bits < 8 {
5337            let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
5338            bytes.extend_from_slice(&mov_one.to_le_bytes());
5339        } else {
5340            // MOV.W Rd, #1 (32-bit)
5341            let hw1: u16 = 0xF04F;
5342            let hw2: u16 = ((rd_bits as u16) << 8) | 0x01;
5343            bytes.extend_from_slice(&hw1.to_le_bytes());
5344            bytes.extend_from_slice(&hw2.to_le_bytes());
5345        }
5346
5347        Ok(bytes)
5348    }
5349
5350    /// Encode F32 constant load as Thumb-2: MOVW + MOVT + VMOV
5351    fn encode_thumb_f32_const(&self, sd: &VfpReg, value: f32) -> Result<Vec<u8>> {
5352        let mut bytes = Vec::new();
5353        let bits = value.to_bits();
5354        let rt: u32 = 12; // R12/IP as temp
5355
5356        // MOVW R12, #lo16
5357        // Thumb-2 MOVW: 11110 i 10 0100 imm4 | 0 imm3 Rd imm8
5358        let lo16 = bits & 0xFFFF;
5359        let imm4 = (lo16 >> 12) & 0xF;
5360        let i_bit = (lo16 >> 11) & 1;
5361        let imm3 = (lo16 >> 8) & 0x7;
5362        let imm8 = lo16 & 0xFF;
5363        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
5364        let hw2: u16 = ((imm3 << 12) | (rt << 8) | imm8) as u16;
5365        bytes.extend_from_slice(&hw1.to_le_bytes());
5366        bytes.extend_from_slice(&hw2.to_le_bytes());
5367
5368        // MOVT R12, #hi16
5369        let hi16 = (bits >> 16) & 0xFFFF;
5370        let imm4 = (hi16 >> 12) & 0xF;
5371        let i_bit = (hi16 >> 11) & 1;
5372        let imm3 = (hi16 >> 8) & 0x7;
5373        let imm8 = hi16 & 0xFF;
5374        let hw1: u16 = (0xF2C0 | (i_bit << 10) | imm4) as u16;
5375        let hw2: u16 = ((imm3 << 12) | (rt << 8) | imm8) as u16;
5376        bytes.extend_from_slice(&hw1.to_le_bytes());
5377        bytes.extend_from_slice(&hw2.to_le_bytes());
5378
5379        // VMOV Sd, R12
5380        let vmov = encode_vmov_core_sreg(true, sd, &Reg::R12)?;
5381        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5382
5383        Ok(bytes)
5384    }
5385
5386    /// Encode VMOV + VCVT.F32.xS32 as Thumb-2
5387    fn encode_thumb_f32_convert_i32(&self, sd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
5388        let mut bytes = Vec::new();
5389
5390        // VMOV Sd, Rm
5391        let vmov = encode_vmov_core_sreg(true, sd, rm)?;
5392        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5393
5394        // VCVT.F32.S32/U32 Sd, Sd
5395        let sd_num = vfp_sreg_to_num(sd)?;
5396        let (vd, d) = encode_sreg(sd_num);
5397        let (vm, m) = encode_sreg(sd_num);
5398        let base = if signed { 0xEEB80A40 } else { 0xEEB80AC0 };
5399        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
5400        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5401
5402        Ok(bytes)
5403    }
5404
5405    /// Encode F32 rounding pseudo-op as Thumb-2 via VCVT to integer and back
5406    /// Encode F32 rounding as Thumb-2.
5407    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
5408    ///
5409    /// For trunc: uses VCVTR.S32.F32 (always truncates).
5410    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F32 (non-R variant),
5411    /// then restores FPSCR.
5412    fn encode_thumb_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
5413        let mut bytes = Vec::new();
5414        let sm_num = vfp_sreg_to_num(sm)?;
5415        let sd_num = vfp_sreg_to_num(sd)?;
5416        let (vd_s, d_s) = encode_sreg(sd_num);
5417        let (vm_s, m_s) = encode_sreg(sm_num);
5418
5419        if mode == 0b11 {
5420            // Trunc (toward zero): VCVTR.S32.F32 — bit[7]=1, always truncates
5421            let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
5422            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5423        } else {
5424            // ceil/floor/nearest: manipulate FPSCR rounding mode
5425            let rt: u32 = 12; // R12/IP as temp
5426
5427            // VMRS R12, FPSCR
5428            let vmrs = 0xEEF10A10 | (rt << 12);
5429            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5430
5431            // BIC.W R12, R12, #(3 << 22) — clear RMode bits [23:22]
5432            // Thumb-2 modified immediate for 3<<22 = 0x00C00000:
5433            // BIC.W encoding: 11110 i 0 0001 S Rn | 0 imm3 Rd imm8
5434            // 0x00C00000 = 0x03 shifted left by 22 => Thumb mod-imm: i=0, imm3=0b101, imm8=0x03
5435            let bic_hw1: u16 = 0xF020 | ((rt as u16) & 0xF); // BIC, Rn=R12
5436            let bic_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | 0x03;
5437            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5438            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5439
5440            // ORR.W R12, R12, #(mode << 22)
5441            if mode != 0 {
5442                let orr_hw1: u16 = 0xF040 | ((rt as u16) & 0xF); // ORR, Rn=R12
5443                let orr_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | (mode as u16);
5444                bytes.extend_from_slice(&orr_hw1.to_le_bytes());
5445                bytes.extend_from_slice(&orr_hw2.to_le_bytes());
5446            }
5447
5448            // VMSR FPSCR, R12
5449            let vmsr = 0xEEE10A10 | (rt << 12);
5450            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5451
5452            // VCVT.S32.F32 Sd, Sm — non-R variant (bit[7]=0), uses FPSCR rmode
5453            let vcvt_to_int = 0xEEBD0A40 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
5454            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5455
5456            // Restore FPSCR: clear rmode bits back to nearest (default)
5457            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5458            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5459            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5460            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5461        }
5462
5463        // VCVT.F32.S32 Sd, Sd (convert integer result back to float)
5464        let (vd2, d2) = encode_sreg(sd_num);
5465        let vcvt_to_float = 0xEEB80A40 | (d2 << 22) | (vd2 << 12) | (d_s << 5) | vd_s;
5466        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_float));
5467
5468        Ok(bytes)
5469    }
5470
5471    /// Encode F32 min/max as Thumb-2: VMOV + VCMP + VMRS + IT + VMOV
5472    fn encode_thumb_f32_minmax(
5473        &self,
5474        sd: &VfpReg,
5475        sn: &VfpReg,
5476        sm: &VfpReg,
5477        is_min: bool,
5478    ) -> Result<Vec<u8>> {
5479        let mut bytes = Vec::new();
5480        let sn_num = vfp_sreg_to_num(sn)?;
5481        let sm_num = vfp_sreg_to_num(sm)?;
5482        let sd_num = vfp_sreg_to_num(sd)?;
5483
5484        // VMOV.F32 Sd, Sn
5485        let (vd, d) = encode_sreg(sd_num);
5486        let (vn, n) = encode_sreg(sn_num);
5487        let vmov_sn = 0xEEB00A40 | (d << 22) | (vd << 12) | (n << 5) | vn;
5488        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_sn));
5489
5490        // VCMP.F32 Sn, Sm
5491        let (vm, m) = encode_sreg(sm_num);
5492        let vcmp = 0xEEB40A40 | (n << 22) | (vn << 12) | (m << 5) | vm;
5493        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5494
5495        // VMRS APSR_nzcv, FPSCR
5496        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5497
5498        // IT GT (for min) or IT MI (for max)
5499        let cond: u16 = if is_min { 0xC } else { 0x4 };
5500        let it: u16 = 0xBF00 | (cond << 4) | 0x8;
5501        bytes.extend_from_slice(&it.to_le_bytes());
5502
5503        // VMOV{cond}.F32 Sd, Sm — conditional VMOV in IT block
5504        let vmov_sm = 0xEEB00A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5505        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_sm));
5506
5507        Ok(bytes)
5508    }
5509
5510    /// Encode F32 copysign as Thumb-2
5511    fn encode_thumb_f32_copysign(&self, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
5512        let mut bytes = Vec::new();
5513
5514        // VMOV R12, Sm (get sign source bits)
5515        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5516            false,
5517            sm,
5518            &Reg::R12,
5519        )?));
5520
5521        // VMOV R0, Sn (get magnitude source bits)
5522        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5523            false,
5524            sn,
5525            &Reg::R0,
5526        )?));
5527
5528        // AND.W R12, R12, #0x80000000
5529        // Thumb-2 modified immediate: 0x80000000 = constant 0x80 with rotation
5530        // Using T1 encoding: 11110 i 0 0000 S Rn | 0 imm3 Rd imm8
5531        // 0x80000000: i=0, imm3=0b001, imm8=0x00 (rotation=4, value=0x80)
5532        // Actually encoding #0x80000000 as modified constant:
5533        // bit pattern 1 followed by 31 zeros: enc = 0b0100_00000000 = 0x0100? No.
5534        // ARM modified immediate: abcdefgh rotated. 0x80000000 = 0x80 ROR 2 = enc 0x0102
5535        // Actually: value = abcdefgh ROR (2*rot). 0x80 = 10000000, ROR 2 gives 0x20000000.
5536        // For 0x80000000: 0x02 ROR 2 = 0x80000000. So imm12 = (1<<8) | 0x02 = 0x102
5537        let hw1: u16 = 0xF000 | 12; // AND.W R12, R12, #modified_const (i=0, Rn=R12)
5538        let hw2: u16 = (0x1 << 12) | (12 << 8) | 0x02; // imm3=1, Rd=R12, imm8=0x02
5539        bytes.extend_from_slice(&hw1.to_le_bytes());
5540        bytes.extend_from_slice(&hw2.to_le_bytes());
5541
5542        // BIC.W R0, R0, #0x80000000 (R0 = register 0, fields are zero)
5543        let hw1: u16 = 0xF020; // BIC.W R0, R0, #modified_const (i=0, Rn=R0)
5544        let hw2: u16 = (0x1 << 12) | 0x02; // imm3=1, Rd=R0, imm8=0x02
5545        bytes.extend_from_slice(&hw1.to_le_bytes());
5546        bytes.extend_from_slice(&hw2.to_le_bytes());
5547
5548        // ORR.W R0, R0, R12 (R0 = register 0)
5549        let hw1: u16 = 0xEA40; // ORR.W R0, R0, R12 (Rn=R0)
5550        let hw2: u16 = 12; // Rd=R0, Rm=R12
5551        bytes.extend_from_slice(&hw1.to_le_bytes());
5552        bytes.extend_from_slice(&hw2.to_le_bytes());
5553
5554        // VMOV Sd, R0
5555        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5556            true,
5557            sd,
5558            &Reg::R0,
5559        )?));
5560
5561        Ok(bytes)
5562    }
5563
5564    /// Encode F64 comparison as Thumb-2: VCMP.F64 + VMRS + MOV #0 + IT + MOV #1
5565    fn encode_thumb_f64_compare(
5566        &self,
5567        rd: &Reg,
5568        dn: &VfpReg,
5569        dm: &VfpReg,
5570        cond_code: u32,
5571    ) -> Result<Vec<u8>> {
5572        let mut bytes = Vec::new();
5573        let rd_bits = reg_to_bits(rd);
5574
5575        // VCMP.F64 Dn, Dm
5576        let dn_num = vfp_dreg_to_num(dn)?;
5577        let dm_num = vfp_dreg_to_num(dm)?;
5578        let (vd, d) = encode_dreg(dn_num);
5579        let (vm, m) = encode_dreg(dm_num);
5580        let vcmp = 0xEEB40B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5581        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5582
5583        // VMRS APSR_nzcv, FPSCR
5584        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5585
5586        // MOVS Rd, #0
5587        if rd_bits < 8 {
5588            let movs_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
5589            bytes.extend_from_slice(&movs_zero.to_le_bytes());
5590        } else {
5591            let hw1: u16 = 0xF04F;
5592            let hw2: u16 = (rd_bits as u16) << 8;
5593            bytes.extend_from_slice(&hw1.to_le_bytes());
5594            bytes.extend_from_slice(&hw2.to_le_bytes());
5595        }
5596
5597        // IT<cond>
5598        let it: u16 = 0xBF00 | ((cond_code as u16) << 4) | 0x8;
5599        bytes.extend_from_slice(&it.to_le_bytes());
5600
5601        // MOV Rd, #1
5602        if rd_bits < 8 {
5603            let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
5604            bytes.extend_from_slice(&mov_one.to_le_bytes());
5605        } else {
5606            let hw1: u16 = 0xF04F;
5607            let hw2: u16 = ((rd_bits as u16) << 8) | 0x01;
5608            bytes.extend_from_slice(&hw1.to_le_bytes());
5609            bytes.extend_from_slice(&hw2.to_le_bytes());
5610        }
5611
5612        Ok(bytes)
5613    }
5614
5615    /// Encode F64 constant load as Thumb-2: MOVW+MOVT (lo32 into R0) + MOVW+MOVT (hi32 into R12) + VMOV Dd, R0, R12
5616    fn encode_thumb_f64_const(&self, dd: &VfpReg, value: f64) -> Result<Vec<u8>> {
5617        let mut bytes = Vec::new();
5618        let bits = value.to_bits();
5619        let lo32 = bits as u32;
5620        let hi32 = (bits >> 32) as u32;
5621
5622        // MOVW R0, #lo16(lo32)
5623        let lo16 = lo32 & 0xFFFF;
5624        bytes.extend_from_slice(&self.encode_thumb32_movw_raw(0, lo16)?);
5625
5626        // MOVT R0, #hi16(lo32)
5627        let hi16 = (lo32 >> 16) & 0xFFFF;
5628        bytes.extend_from_slice(&self.encode_thumb32_movt_raw(0, hi16)?);
5629
5630        // MOVW R12, #lo16(hi32)
5631        let lo16 = hi32 & 0xFFFF;
5632        bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, lo16)?);
5633
5634        // MOVT R12, #hi16(hi32)
5635        let hi16 = (hi32 >> 16) & 0xFFFF;
5636        bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, hi16)?);
5637
5638        // VMOV Dd, R0, R12
5639        let vmov = encode_vmov_core_dreg(true, dd, &Reg::R0, &Reg::R12)?;
5640        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5641
5642        Ok(bytes)
5643    }
5644
5645    /// Encode VMOV Sd, Rm + VCVT.F64.S32/U32 Dd, Sd as Thumb-2
5646    fn encode_thumb_f64_convert_i32(&self, dd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
5647        let mut bytes = Vec::new();
5648
5649        // VMOV S0, Rm
5650        let vmov = encode_vmov_core_sreg(true, &VfpReg::S0, rm)?;
5651        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5652
5653        // VCVT.F64.S32 Dd, S0 or VCVT.F64.U32 Dd, S0
5654        let dd_num = vfp_dreg_to_num(dd)?;
5655        let (vd, d) = encode_dreg(dd_num);
5656        let base = if signed { 0xEEB80B40 } else { 0xEEB80BC0 };
5657        let vcvt = base | (d << 22) | (vd << 12);
5658        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5659
5660        Ok(bytes)
5661    }
5662
5663    /// Encode VCVT.F64.F32 Dd, Sm as Thumb-2
5664    fn encode_thumb_f64_promote_f32(&self, dd: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
5665        let dd_num = vfp_dreg_to_num(dd)?;
5666        let sm_num = vfp_sreg_to_num(sm)?;
5667        let (vd, d) = encode_dreg(dd_num);
5668        let (vm, m) = encode_sreg(sm_num);
5669
5670        let vcvt = 0xEEB70AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
5671        Ok(vfp_to_thumb_bytes(vcvt))
5672    }
5673
5674    /// Encode VCVT.S32/U32.F64 S0, Dm + VMOV Rd, S0 as Thumb-2
5675    fn encode_thumb_i32_trunc_f64(&self, rd: &Reg, dm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
5676        let mut bytes = Vec::new();
5677        let dm_num = vfp_dreg_to_num(dm)?;
5678        let (vm, m) = encode_dreg(dm_num);
5679
5680        // VCVT.S32.F64 S0, Dm or VCVT.U32.F64 S0, Dm
5681        let base = if signed { 0xEEBD0BC0 } else { 0xEEBC0BC0 };
5682        let vcvt = base | (m << 5) | vm;
5683        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5684
5685        // VMOV Rd, S0
5686        let vmov = encode_vmov_core_sreg(false, &VfpReg::S0, rd)?;
5687        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5688
5689        Ok(bytes)
5690    }
5691
5692    /// Encode F64 rounding pseudo-op as Thumb-2 via VCVT to integer and back
5693    /// Encode F64 rounding as Thumb-2.
5694    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
5695    fn encode_thumb_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
5696        let mut bytes = Vec::new();
5697        let dm_num = vfp_dreg_to_num(dm)?;
5698        let dd_num = vfp_dreg_to_num(dd)?;
5699        let (vm, m) = encode_dreg(dm_num);
5700        let (vd, d) = encode_dreg(dd_num);
5701
5702        if mode == 0b11 {
5703            // Trunc: VCVTR.S32.F64 — bit[7]=1, always truncates
5704            let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
5705            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5706        } else {
5707            let rt: u32 = 12;
5708
5709            // VMRS R12, FPSCR
5710            let vmrs = 0xEEF10A10 | (rt << 12);
5711            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5712
5713            // BIC.W R12, R12, #(3 << 22)
5714            let bic_hw1: u16 = 0xF020 | ((rt as u16) & 0xF);
5715            let bic_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | 0x03;
5716            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5717            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5718
5719            // ORR.W R12, R12, #(mode << 22)
5720            if mode != 0 {
5721                let orr_hw1: u16 = 0xF040 | ((rt as u16) & 0xF);
5722                let orr_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | (mode as u16);
5723                bytes.extend_from_slice(&orr_hw1.to_le_bytes());
5724                bytes.extend_from_slice(&orr_hw2.to_le_bytes());
5725            }
5726
5727            // VMSR FPSCR, R12
5728            let vmsr = 0xEEE10A10 | (rt << 12);
5729            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5730
5731            // VCVT.S32.F64 S0, Dm — non-R variant (bit[7]=0)
5732            let vcvt_to_int = 0xEEBD0B40 | (m << 5) | vm;
5733            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5734
5735            // Restore FPSCR
5736            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5737            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5738            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5739            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5740        }
5741
5742        // VCVT.F64.S32 Dd, S0
5743        let vcvt_to_float = 0xEEB80B40 | (d << 22) | (vd << 12);
5744        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_float));
5745
5746        Ok(bytes)
5747    }
5748
5749    /// Encode F64 min/max as Thumb-2
5750    fn encode_thumb_f64_minmax(
5751        &self,
5752        dd: &VfpReg,
5753        dn: &VfpReg,
5754        dm: &VfpReg,
5755        is_min: bool,
5756    ) -> Result<Vec<u8>> {
5757        let mut bytes = Vec::new();
5758        let dn_num = vfp_dreg_to_num(dn)?;
5759        let dm_num = vfp_dreg_to_num(dm)?;
5760        let dd_num = vfp_dreg_to_num(dd)?;
5761
5762        // VMOV.F64 Dd, Dn
5763        let (vd, d) = encode_dreg(dd_num);
5764        let (vn, n) = encode_dreg(dn_num);
5765        let vmov_dn = 0xEEB00B40 | (d << 22) | (vd << 12) | (n << 5) | vn;
5766        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_dn));
5767
5768        // VCMP.F64 Dn, Dm
5769        let (vm, m) = encode_dreg(dm_num);
5770        let vcmp = 0xEEB40B40 | (n << 22) | (vn << 12) | (m << 5) | vm;
5771        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5772
5773        // VMRS APSR_nzcv, FPSCR
5774        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5775
5776        // IT GT (for min) or IT MI (for max)
5777        let cond: u16 = if is_min { 0xC } else { 0x4 };
5778        let it: u16 = 0xBF00 | (cond << 4) | 0x8;
5779        bytes.extend_from_slice(&it.to_le_bytes());
5780
5781        // VMOV{cond}.F64 Dd, Dm
5782        let vmov_dm = 0xEEB00B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5783        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_dm));
5784
5785        Ok(bytes)
5786    }
5787
5788    /// Encode F64 copysign as Thumb-2
5789    fn encode_thumb_f64_copysign(&self, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<Vec<u8>> {
5790        let mut bytes = Vec::new();
5791
5792        // VMOV R0, R12, Dm (get sign source)
5793        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
5794            false,
5795            dm,
5796            &Reg::R0,
5797            &Reg::R12,
5798        )?));
5799
5800        // VMOV R1, R2, Dn (get magnitude source)
5801        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
5802            false,
5803            dn,
5804            &Reg::R1,
5805            &Reg::R2,
5806        )?));
5807
5808        // AND.W R12, R12, #0x80000000 (i=0, Rn=R12)
5809        let hw1: u16 = 0xF000 | 12;
5810        let hw2: u16 = (0x1 << 12) | (12 << 8) | 0x02;
5811        bytes.extend_from_slice(&hw1.to_le_bytes());
5812        bytes.extend_from_slice(&hw2.to_le_bytes());
5813
5814        // BIC.W R2, R2, #0x80000000 (i=0, Rn=R2)
5815        let hw1: u16 = 0xF020 | 2;
5816        let hw2: u16 = (0x1 << 12) | (2 << 8) | 0x02;
5817        bytes.extend_from_slice(&hw1.to_le_bytes());
5818        bytes.extend_from_slice(&hw2.to_le_bytes());
5819
5820        // ORR.W R2, R2, R12
5821        let hw1: u16 = 0xEA40 | 2;
5822        let hw2: u16 = (2 << 8) | 12;
5823        bytes.extend_from_slice(&hw1.to_le_bytes());
5824        bytes.extend_from_slice(&hw2.to_le_bytes());
5825
5826        // VMOV Dd, R1, R2
5827        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
5828            true,
5829            dd,
5830            &Reg::R1,
5831            &Reg::R2,
5832        )?));
5833
5834        Ok(bytes)
5835    }
5836
5837    /// Encode VCVT.S32/U32.F32 + VMOV as Thumb-2
5838    fn encode_thumb_i32_trunc_f32(&self, rd: &Reg, sm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
5839        let mut bytes = Vec::new();
5840
5841        let sm_num = vfp_sreg_to_num(sm)?;
5842        let (vd, d) = encode_sreg(sm_num);
5843        let (vm, m) = encode_sreg(sm_num);
5844        let base = if signed { 0xEEBD0AC0 } else { 0xEEBC0AC0 };
5845        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
5846        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5847
5848        // VMOV Rd, Sm
5849        let vmov = encode_vmov_core_sreg(false, sm, rd)?;
5850        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5851
5852        Ok(bytes)
5853    }
5854
5855    // === Thumb-2 32-bit encoding helpers ===
5856
5857    /// Encode Thumb-2 32-bit ADD with immediate
5858    fn encode_thumb32_add(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
5859        let rd_bits = reg_to_bits(rd);
5860        let rn_bits = reg_to_bits(rn);
5861
5862        // ADD.W Rd, Rn, #imm12
5863        // First halfword: 1111 0 i 0 1000 S Rn
5864        // Second halfword: 0 imm3 Rd imm8
5865        let i_bit = (imm >> 11) & 1;
5866        let imm3 = (imm >> 8) & 0x7;
5867        let imm8 = imm & 0xFF;
5868
5869        let hw1: u16 = (0xF100 | (i_bit << 10) | rn_bits) as u16;
5870        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
5871
5872        let mut bytes = hw1.to_le_bytes().to_vec();
5873        bytes.extend_from_slice(&hw2.to_le_bytes());
5874        Ok(bytes)
5875    }
5876
5877    /// Encode Thumb-2 32-bit SUB with immediate
5878    fn encode_thumb32_sub(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
5879        let rd_bits = reg_to_bits(rd);
5880        let rn_bits = reg_to_bits(rn);
5881
5882        let i_bit = (imm >> 11) & 1;
5883        let imm3 = (imm >> 8) & 0x7;
5884        let imm8 = imm & 0xFF;
5885
5886        let hw1: u16 = (0xF1A0 | (i_bit << 10) | rn_bits) as u16;
5887        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
5888
5889        let mut bytes = hw1.to_le_bytes().to_vec();
5890        bytes.extend_from_slice(&hw2.to_le_bytes());
5891        Ok(bytes)
5892    }
5893
5894    /// Encode Thumb-2 32-bit ADDS with immediate (sets flags)
5895    fn encode_thumb32_adds(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
5896        let rd_bits = reg_to_bits(rd);
5897        let rn_bits = reg_to_bits(rn);
5898
5899        let i_bit = (imm >> 11) & 1;
5900        let imm3 = (imm >> 8) & 0x7;
5901        let imm8 = imm & 0xFF;
5902
5903        // ADDS.W Rd, Rn, #imm (with S=1)
5904        // First halfword: 1111 0 i 0 1000 1 Rn = F110 | i<<10 | Rn
5905        let hw1: u16 = (0xF110 | (i_bit << 10) | rn_bits) as u16;
5906        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
5907
5908        let mut bytes = hw1.to_le_bytes().to_vec();
5909        bytes.extend_from_slice(&hw2.to_le_bytes());
5910        Ok(bytes)
5911    }
5912
5913    /// Encode Thumb-2 32-bit SUBS with immediate (sets flags)
5914    fn encode_thumb32_subs(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
5915        let rd_bits = reg_to_bits(rd);
5916        let rn_bits = reg_to_bits(rn);
5917
5918        let i_bit = (imm >> 11) & 1;
5919        let imm3 = (imm >> 8) & 0x7;
5920        let imm8 = imm & 0xFF;
5921
5922        // SUBS.W Rd, Rn, #imm (with S=1)
5923        // First halfword: 1111 0 i 0 1101 1 Rn = F1B0 | i<<10 | Rn
5924        let hw1: u16 = (0xF1B0 | (i_bit << 10) | rn_bits) as u16;
5925        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
5926
5927        let mut bytes = hw1.to_le_bytes().to_vec();
5928        bytes.extend_from_slice(&hw2.to_le_bytes());
5929        Ok(bytes)
5930    }
5931
5932    /// Encode Thumb-2 32-bit MOVW (16-bit immediate)
5933    ///
5934    /// # Contract (Verus-style)
5935    /// ```text
5936    /// requires rd <= R14
5937    /// ensures result.len() == 4
5938    /// ensures (imm & 0xFFFF) can be reconstructed from the encoding
5939    /// ```
5940    fn encode_thumb32_movw(&self, rd: &Reg, imm: u32) -> Result<Vec<u8>> {
5941        let rd_bits = reg_to_bits(rd);
5942        reg_bits_checked(rd_bits)?;
5943        let imm16 = imm & 0xFFFF;
5944
5945        // MOVW Rd, #imm16
5946        // 1111 0 i 10 0 1 0 0 imm4 | 0 imm3 Rd imm8
5947        let imm4 = (imm16 >> 12) & 0xF;
5948        let i_bit = (imm16 >> 11) & 1;
5949        let imm3 = (imm16 >> 8) & 0x7;
5950        let imm8 = imm16 & 0xFF;
5951
5952        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
5953        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
5954
5955        let mut bytes = hw1.to_le_bytes().to_vec();
5956        bytes.extend_from_slice(&hw2.to_le_bytes());
5957        encoding_contracts::verify_thumb32(&bytes);
5958        Ok(bytes)
5959    }
5960
5961    /// Encode Thumb-2 32-bit shift with immediate
5962    ///
5963    /// # Contract (Verus-style)
5964    /// ```text
5965    /// requires rd <= R14, rm <= R14
5966    /// ensures result.len() == 4
5967    /// ```
5968    fn encode_thumb32_shift(
5969        &self,
5970        rd: &Reg,
5971        rm: &Reg,
5972        shift: u32,
5973        shift_type: u8,
5974    ) -> Result<Vec<u8>> {
5975        let rd_bits = reg_to_bits(rd);
5976        let rm_bits = reg_to_bits(rm);
5977        reg_bits_checked(rd_bits)?;
5978        reg_bits_checked(rm_bits)?;
5979        let imm5 = shift & 0x1F;
5980        let imm2 = imm5 & 0x3;
5981        let imm3 = (imm5 >> 2) & 0x7;
5982
5983        // MOV.W Rd, Rm, <shift> #imm
5984        // EA4F 0 imm3 Rd imm2 type Rm
5985        let hw1: u16 = 0xEA4F;
5986        let hw2: u16 =
5987            ((imm3 << 12) | (rd_bits << 8) | (imm2 << 6) | ((shift_type as u32) << 4) | rm_bits)
5988                as u16;
5989
5990        let mut bytes = hw1.to_le_bytes().to_vec();
5991        bytes.extend_from_slice(&hw2.to_le_bytes());
5992        Ok(bytes)
5993    }
5994
5995    /// Encode Thumb-2 32-bit shift by register
5996    /// Encoding: 11111010 0xx0 Rn | 1111 Rd 0000 Rm
5997    /// shift_type: 00=LSL, 01=LSR, 10=ASR, 11=ROR
5998    fn encode_thumb32_shift_reg(
5999        &self,
6000        rd: &Reg,
6001        rn: &Reg,
6002        rm: &Reg,
6003        shift_type: u8,
6004    ) -> Result<Vec<u8>> {
6005        let rd_bits = reg_to_bits(rd);
6006        let rn_bits = reg_to_bits(rn);
6007        let rm_bits = reg_to_bits(rm);
6008
6009        // hw1: 1111 1010 0xx0 Rn
6010        let hw1: u16 = (0xFA00 | ((shift_type as u32) << 5) | rn_bits) as u16;
6011        // hw2: 1111 Rd 0000 Rm
6012        let hw2: u16 = (0xF000 | (rd_bits << 8) | rm_bits) as u16;
6013
6014        let mut bytes = hw1.to_le_bytes().to_vec();
6015        bytes.extend_from_slice(&hw2.to_le_bytes());
6016        Ok(bytes)
6017    }
6018
6019    /// Encode Thumb-2 32-bit CMP with immediate
6020    fn encode_thumb32_cmp_imm(&self, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6021        let rn_bits = reg_to_bits(rn);
6022
6023        let i_bit = (imm >> 11) & 1;
6024        let imm3 = (imm >> 8) & 0x7;
6025        let imm8 = imm & 0xFF;
6026
6027        // CMP.W Rn, #imm
6028        let hw1: u16 = (0xF1B0 | (i_bit << 10) | rn_bits) as u16;
6029        let hw2: u16 = ((imm3 << 12) | 0x0F00 | imm8) as u16;
6030
6031        let mut bytes = hw1.to_le_bytes().to_vec();
6032        bytes.extend_from_slice(&hw2.to_le_bytes());
6033        Ok(bytes)
6034    }
6035
6036    /// Encode Thumb-2 32-bit LDR
6037    fn encode_thumb32_ldr(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6038        let rd_bits = reg_to_bits(rd);
6039        let base_bits = reg_to_bits(base);
6040
6041        // LDR.W Rd, [Rn, #imm12]
6042        let hw1: u16 = (0xF8D0 | base_bits) as u16;
6043        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6044
6045        let mut bytes = hw1.to_le_bytes().to_vec();
6046        bytes.extend_from_slice(&hw2.to_le_bytes());
6047        Ok(bytes)
6048    }
6049
6050    /// Encode Thumb-2 32-bit STR
6051    fn encode_thumb32_str(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6052        let rd_bits = reg_to_bits(rd);
6053        let base_bits = reg_to_bits(base);
6054
6055        // STR.W Rd, [Rn, #imm12]
6056        let hw1: u16 = (0xF8C0 | base_bits) as u16;
6057        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6058
6059        let mut bytes = hw1.to_le_bytes().to_vec();
6060        bytes.extend_from_slice(&hw2.to_le_bytes());
6061        Ok(bytes)
6062    }
6063
6064    /// Encode Thumb-2 32-bit LDR with register offset: LDR.W Rd, [Rn, Rm]
6065    fn encode_thumb32_ldr_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6066        let rd_bits = reg_to_bits(rd);
6067        let base_bits = reg_to_bits(base);
6068        let rm_bits = reg_to_bits(offset_reg);
6069
6070        // LDR.W Rd, [Rn, Rm, LSL #0]
6071        // Encoding: 1111 1000 0101 Rn | Rt 0000 00 imm2 Rm
6072        // imm2 = 00 for no shift (LSL #0)
6073        let hw1: u16 = (0xF850 | base_bits) as u16;
6074        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6075
6076        let mut bytes = hw1.to_le_bytes().to_vec();
6077        bytes.extend_from_slice(&hw2.to_le_bytes());
6078        Ok(bytes)
6079    }
6080
6081    /// Encode Thumb-2 32-bit STR with register offset: STR.W Rd, [Rn, Rm]
6082    fn encode_thumb32_str_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6083        let rd_bits = reg_to_bits(rd);
6084        let base_bits = reg_to_bits(base);
6085        let rm_bits = reg_to_bits(offset_reg);
6086
6087        // STR.W Rd, [Rn, Rm, LSL #0]
6088        // Encoding: 1111 1000 0100 Rn | Rt 0000 00 imm2 Rm
6089        // imm2 = 00 for no shift (LSL #0)
6090        let hw1: u16 = (0xF840 | base_bits) as u16;
6091        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6092
6093        let mut bytes = hw1.to_le_bytes().to_vec();
6094        bytes.extend_from_slice(&hw2.to_le_bytes());
6095        Ok(bytes)
6096    }
6097
6098    // === Sub-word load/store Thumb-2 encoding helpers ===
6099
6100    /// Encode Thumb-2 32-bit LDRB with immediate: LDRB.W Rd, [Rn, #imm12]
6101    fn encode_thumb32_ldrb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6102        let rd_bits = reg_to_bits(rd);
6103        let base_bits = reg_to_bits(base);
6104        // LDRB.W Rd, [Rn, #imm12]: 1111 1000 1001 Rn | Rt imm12
6105        let hw1: u16 = (0xF890 | base_bits) as u16;
6106        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6107        let mut bytes = hw1.to_le_bytes().to_vec();
6108        bytes.extend_from_slice(&hw2.to_le_bytes());
6109        Ok(bytes)
6110    }
6111
6112    /// Encode Thumb-2 32-bit LDRB with register: LDRB.W Rd, [Rn, Rm]
6113    fn encode_thumb32_ldrb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6114        let rd_bits = reg_to_bits(rd);
6115        let base_bits = reg_to_bits(base);
6116        let rm_bits = reg_to_bits(offset_reg);
6117        // LDRB.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0001 Rn | Rt 0000 00 imm2 Rm
6118        let hw1: u16 = (0xF810 | base_bits) as u16;
6119        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6120        let mut bytes = hw1.to_le_bytes().to_vec();
6121        bytes.extend_from_slice(&hw2.to_le_bytes());
6122        Ok(bytes)
6123    }
6124
6125    /// Encode Thumb-2 32-bit LDRSB with immediate: LDRSB.W Rd, [Rn, #imm12]
6126    fn encode_thumb32_ldrsb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6127        let rd_bits = reg_to_bits(rd);
6128        let base_bits = reg_to_bits(base);
6129        // LDRSB.W Rd, [Rn, #imm12]: 1111 1001 1001 Rn | Rt imm12
6130        let hw1: u16 = (0xF990 | base_bits) as u16;
6131        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6132        let mut bytes = hw1.to_le_bytes().to_vec();
6133        bytes.extend_from_slice(&hw2.to_le_bytes());
6134        Ok(bytes)
6135    }
6136
6137    /// Encode Thumb-2 32-bit LDRSB with register: LDRSB.W Rd, [Rn, Rm]
6138    fn encode_thumb32_ldrsb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6139        let rd_bits = reg_to_bits(rd);
6140        let base_bits = reg_to_bits(base);
6141        let rm_bits = reg_to_bits(offset_reg);
6142        // LDRSB.W Rd, [Rn, Rm, LSL #0]: 1111 1001 0001 Rn | Rt 0000 00 imm2 Rm
6143        let hw1: u16 = (0xF910 | base_bits) as u16;
6144        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6145        let mut bytes = hw1.to_le_bytes().to_vec();
6146        bytes.extend_from_slice(&hw2.to_le_bytes());
6147        Ok(bytes)
6148    }
6149
6150    /// Encode Thumb-2 32-bit LDRH with immediate: LDRH.W Rd, [Rn, #imm12]
6151    fn encode_thumb32_ldrh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6152        let rd_bits = reg_to_bits(rd);
6153        let base_bits = reg_to_bits(base);
6154        // LDRH.W Rd, [Rn, #imm12]: 1111 1000 1011 Rn | Rt imm12
6155        let hw1: u16 = (0xF8B0 | base_bits) as u16;
6156        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6157        let mut bytes = hw1.to_le_bytes().to_vec();
6158        bytes.extend_from_slice(&hw2.to_le_bytes());
6159        Ok(bytes)
6160    }
6161
6162    /// Encode Thumb-2 32-bit LDRH with register: LDRH.W Rd, [Rn, Rm]
6163    fn encode_thumb32_ldrh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6164        let rd_bits = reg_to_bits(rd);
6165        let base_bits = reg_to_bits(base);
6166        let rm_bits = reg_to_bits(offset_reg);
6167        // LDRH.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0011 Rn | Rt 0000 00 imm2 Rm
6168        let hw1: u16 = (0xF830 | base_bits) as u16;
6169        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6170        let mut bytes = hw1.to_le_bytes().to_vec();
6171        bytes.extend_from_slice(&hw2.to_le_bytes());
6172        Ok(bytes)
6173    }
6174
6175    /// Encode Thumb-2 32-bit LDRSH with immediate: LDRSH.W Rd, [Rn, #imm12]
6176    fn encode_thumb32_ldrsh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6177        let rd_bits = reg_to_bits(rd);
6178        let base_bits = reg_to_bits(base);
6179        // LDRSH.W Rd, [Rn, #imm12]: 1111 1001 1011 Rn | Rt imm12
6180        let hw1: u16 = (0xF9B0 | base_bits) as u16;
6181        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6182        let mut bytes = hw1.to_le_bytes().to_vec();
6183        bytes.extend_from_slice(&hw2.to_le_bytes());
6184        Ok(bytes)
6185    }
6186
6187    /// Encode Thumb-2 32-bit LDRSH with register: LDRSH.W Rd, [Rn, Rm]
6188    fn encode_thumb32_ldrsh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6189        let rd_bits = reg_to_bits(rd);
6190        let base_bits = reg_to_bits(base);
6191        let rm_bits = reg_to_bits(offset_reg);
6192        // LDRSH.W Rd, [Rn, Rm, LSL #0]: 1111 1001 0011 Rn | Rt 0000 00 imm2 Rm
6193        let hw1: u16 = (0xF930 | base_bits) as u16;
6194        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6195        let mut bytes = hw1.to_le_bytes().to_vec();
6196        bytes.extend_from_slice(&hw2.to_le_bytes());
6197        Ok(bytes)
6198    }
6199
6200    /// Encode Thumb-2 32-bit STRB with immediate: STRB.W Rd, [Rn, #imm12]
6201    fn encode_thumb32_strb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6202        let rd_bits = reg_to_bits(rd);
6203        let base_bits = reg_to_bits(base);
6204        // STRB.W Rd, [Rn, #imm12]: 1111 1000 1000 Rn | Rt imm12
6205        let hw1: u16 = (0xF880 | base_bits) as u16;
6206        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6207        let mut bytes = hw1.to_le_bytes().to_vec();
6208        bytes.extend_from_slice(&hw2.to_le_bytes());
6209        Ok(bytes)
6210    }
6211
6212    /// Encode Thumb-2 32-bit STRB with register: STRB.W Rd, [Rn, Rm]
6213    fn encode_thumb32_strb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6214        let rd_bits = reg_to_bits(rd);
6215        let base_bits = reg_to_bits(base);
6216        let rm_bits = reg_to_bits(offset_reg);
6217        // STRB.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0000 Rn | Rt 0000 00 imm2 Rm
6218        let hw1: u16 = (0xF800 | base_bits) as u16;
6219        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6220        let mut bytes = hw1.to_le_bytes().to_vec();
6221        bytes.extend_from_slice(&hw2.to_le_bytes());
6222        Ok(bytes)
6223    }
6224
6225    /// Encode Thumb-2 32-bit STRH with immediate: STRH.W Rd, [Rn, #imm12]
6226    fn encode_thumb32_strh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6227        let rd_bits = reg_to_bits(rd);
6228        let base_bits = reg_to_bits(base);
6229        // STRH.W Rd, [Rn, #imm12]: 1111 1000 1010 Rn | Rt imm12
6230        let hw1: u16 = (0xF8A0 | base_bits) as u16;
6231        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6232        let mut bytes = hw1.to_le_bytes().to_vec();
6233        bytes.extend_from_slice(&hw2.to_le_bytes());
6234        Ok(bytes)
6235    }
6236
6237    /// Encode Thumb-2 32-bit STRH with register: STRH.W Rd, [Rn, Rm]
6238    fn encode_thumb32_strh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6239        let rd_bits = reg_to_bits(rd);
6240        let base_bits = reg_to_bits(base);
6241        let rm_bits = reg_to_bits(offset_reg);
6242        // STRH.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0010 Rn | Rt 0000 00 imm2 Rm
6243        let hw1: u16 = (0xF820 | base_bits) as u16;
6244        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6245        let mut bytes = hw1.to_le_bytes().to_vec();
6246        bytes.extend_from_slice(&hw2.to_le_bytes());
6247        Ok(bytes)
6248    }
6249
6250    /// Encode Thumb-2 32-bit ADD with immediate: ADD.W Rd, Rn, #imm
6251    fn encode_thumb32_add_imm(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6252        let rd_bits = reg_to_bits(rd);
6253        let rn_bits = reg_to_bits(rn);
6254
6255        // For small immediates, use ADD.W Rd, Rn, #imm12
6256        // Encoding: 1111 0 i 0 1 0 0 0 S Rn | 0 imm3 Rd imm8
6257        // S = 0 (don't update flags)
6258        // The 12-bit immediate is encoded as: i:imm3:imm8
6259        // For simplicity, we only support imm <= 0xFFF (direct encoding)
6260        if imm <= 0xFFF {
6261            let i_bit = (imm >> 11) & 1;
6262            let imm3 = (imm >> 8) & 0x7;
6263            let imm8 = imm & 0xFF;
6264
6265            let hw1: u16 = (0xF100 | (i_bit << 10) | rn_bits) as u16;
6266            let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6267
6268            let mut bytes = hw1.to_le_bytes().to_vec();
6269            bytes.extend_from_slice(&hw2.to_le_bytes());
6270            Ok(bytes)
6271        } else {
6272            // For larger immediates, would need MOVW/MOVT + ADD
6273            // For now, return error
6274            Err(synth_core::Error::synthesis(
6275                "ADD immediate too large for single instruction",
6276            ))
6277        }
6278    }
6279
6280    // === Raw encoding helpers for POPCNT (take register numbers directly) ===
6281
6282    /// Encode Thumb-2 32-bit MOVW (16-bit immediate) - raw version
6283    ///
6284    /// # Contract (Verus-style)
6285    /// ```text
6286    /// requires rd <= 14, imm16 <= 0xFFFF
6287    /// ensures result.len() == 4
6288    /// ```
6289    fn encode_thumb32_movw_raw(&self, rd: u32, imm16: u32) -> Result<Vec<u8>> {
6290        reg_bits_checked(rd)?;
6291        encoding_contracts::verify_imm16(imm16);
6292        // MOVW Rd, #imm16
6293        // 1111 0 i 10 0 1 0 0 imm4 | 0 imm3 Rd imm8
6294        let imm16 = imm16 & 0xFFFF;
6295        let imm4 = (imm16 >> 12) & 0xF;
6296        let i_bit = (imm16 >> 11) & 1;
6297        let imm3 = (imm16 >> 8) & 0x7;
6298        let imm8 = imm16 & 0xFF;
6299
6300        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
6301        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6302
6303        let mut bytes = hw1.to_le_bytes().to_vec();
6304        bytes.extend_from_slice(&hw2.to_le_bytes());
6305        encoding_contracts::verify_thumb32(&bytes);
6306        Ok(bytes)
6307    }
6308
6309    /// Encode Thumb-2 32-bit MOVT (move top 16 bits) - raw version
6310    ///
6311    /// # Contract (Verus-style)
6312    /// ```text
6313    /// requires rd <= 14, imm16 <= 0xFFFF
6314    /// ensures result.len() == 4
6315    /// ```
6316    fn encode_thumb32_movt_raw(&self, rd: u32, imm16: u32) -> Result<Vec<u8>> {
6317        reg_bits_checked(rd)?;
6318        encoding_contracts::verify_imm16(imm16);
6319        // MOVT Rd, #imm16
6320        // 1111 0 i 10 1 1 0 0 imm4 | 0 imm3 Rd imm8
6321        let imm16 = imm16 & 0xFFFF;
6322        let imm4 = (imm16 >> 12) & 0xF;
6323        let i_bit = (imm16 >> 11) & 1;
6324        let imm3 = (imm16 >> 8) & 0x7;
6325        let imm8 = imm16 & 0xFF;
6326
6327        let hw1: u16 = (0xF2C0 | (i_bit << 10) | imm4) as u16;
6328        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6329
6330        let mut bytes = hw1.to_le_bytes().to_vec();
6331        bytes.extend_from_slice(&hw2.to_le_bytes());
6332        encoding_contracts::verify_thumb32(&bytes);
6333        Ok(bytes)
6334    }
6335
6336    /// Encode Thumb-2 32-bit LSR (logical shift right) with immediate - raw version
6337    fn encode_thumb32_lsr_raw(&self, rd: u32, rm: u32, shift: u32) -> Result<Vec<u8>> {
6338        // MOV.W Rd, Rm, LSR #imm
6339        // EA4F 0 imm3 Rd imm2 01 Rm
6340        let imm5 = shift & 0x1F;
6341        let imm2 = imm5 & 0x3;
6342        let imm3 = (imm5 >> 2) & 0x7;
6343
6344        let hw1: u16 = 0xEA4F;
6345        let hw2: u16 = ((imm3 << 12) | (rd << 8) | (imm2 << 6) | (0b01 << 4) | rm) as u16;
6346
6347        let mut bytes = hw1.to_le_bytes().to_vec();
6348        bytes.extend_from_slice(&hw2.to_le_bytes());
6349        Ok(bytes)
6350    }
6351
6352    /// Encode Thumb-2 32-bit AND (register) - raw version
6353    fn encode_thumb32_and_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6354        // AND.W Rd, Rn, Rm
6355        // EA00 Rn | 0 Rd 00 00 Rm
6356        let hw1: u16 = (0xEA00 | rn) as u16;
6357        let hw2: u16 = ((rd << 8) | rm) as u16;
6358
6359        let mut bytes = hw1.to_le_bytes().to_vec();
6360        bytes.extend_from_slice(&hw2.to_le_bytes());
6361        Ok(bytes)
6362    }
6363
6364    /// Encode Thumb-2 32-bit AND with immediate - raw version
6365    fn encode_thumb32_and_imm_raw(&self, rd: u32, rn: u32, imm: u32) -> Result<Vec<u8>> {
6366        // AND.W Rd, Rn, #<modified_immediate>
6367        // For small immediates (0-255), the encoding is simpler
6368        // F0 00 Rn | 0 imm3 Rd imm8
6369        let i_bit = (imm >> 11) & 1;
6370        let imm3 = (imm >> 8) & 0x7;
6371        let imm8 = imm & 0xFF;
6372
6373        let hw1: u16 = (0xF000 | (i_bit << 10) | rn) as u16;
6374        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6375
6376        let mut bytes = hw1.to_le_bytes().to_vec();
6377        bytes.extend_from_slice(&hw2.to_le_bytes());
6378        Ok(bytes)
6379    }
6380
6381    /// Encode Thumb-2 32-bit SUB (register) - raw version
6382    fn encode_thumb32_sub_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6383        // SUB.W Rd, Rn, Rm
6384        // EBA0 Rn | 0 Rd 00 00 Rm
6385        let hw1: u16 = (0xEBA0 | rn) as u16;
6386        let hw2: u16 = ((rd << 8) | rm) as u16;
6387
6388        let mut bytes = hw1.to_le_bytes().to_vec();
6389        bytes.extend_from_slice(&hw2.to_le_bytes());
6390        Ok(bytes)
6391    }
6392
6393    /// Encode Thumb-2 32-bit ADD (register) - raw version
6394    fn encode_thumb32_add_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6395        // ADD.W Rd, Rn, Rm
6396        // EB00 Rn | 0 Rd 00 00 Rm
6397        let hw1: u16 = (0xEB00 | rn) as u16;
6398        let hw2: u16 = ((rd << 8) | rm) as u16;
6399
6400        let mut bytes = hw1.to_le_bytes().to_vec();
6401        bytes.extend_from_slice(&hw2.to_le_bytes());
6402        Ok(bytes)
6403    }
6404
6405    /// Encode Thumb-2 32-bit ADDS (register, flag-setting) - raw version.
6406    /// Used as the high-register fallback for `ArmOp::Adds` (i64 low-word add)
6407    /// so R8-R11 pair operands don't overflow the 16-bit field — #178/#180.
6408    fn encode_thumb32_adds_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6409        // ADDS.W Rd, Rn, Rm (T3, S=1): EB10 Rn | 0 Rd 00 00 Rm
6410        let hw1: u16 = (0xEB10 | rn) as u16;
6411        let hw2: u16 = ((rd << 8) | rm) as u16;
6412        let mut bytes = hw1.to_le_bytes().to_vec();
6413        bytes.extend_from_slice(&hw2.to_le_bytes());
6414        Ok(bytes)
6415    }
6416
6417    /// Encode Thumb-2 32-bit SUBS (register, flag-setting) - raw version.
6418    /// High-register fallback for `ArmOp::Subs` (i64 low-word subtract) — #178/#180.
6419    fn encode_thumb32_subs_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6420        // SUBS.W Rd, Rn, Rm (T3, S=1): EBB0 Rn | 0 Rd 00 00 Rm
6421        let hw1: u16 = (0xEBB0 | rn) as u16;
6422        let hw2: u16 = ((rd << 8) | rm) as u16;
6423        let mut bytes = hw1.to_le_bytes().to_vec();
6424        bytes.extend_from_slice(&hw2.to_le_bytes());
6425        Ok(bytes)
6426    }
6427
6428    /// Encode a sequence of ARM instructions
6429    pub fn encode_sequence(&self, ops: &[ArmOp]) -> Result<Vec<u8>> {
6430        let mut code = Vec::new();
6431
6432        for op in ops {
6433            let encoded = self.encode(op)?;
6434            code.extend_from_slice(&encoded);
6435        }
6436
6437        Ok(code)
6438    }
6439}
6440
6441/// Convert register to bit encoding (0-15)
6442fn reg_to_bits(reg: &Reg) -> u32 {
6443    match reg {
6444        Reg::R0 => 0,
6445        Reg::R1 => 1,
6446        Reg::R2 => 2,
6447        Reg::R3 => 3,
6448        Reg::R4 => 4,
6449        Reg::R5 => 5,
6450        Reg::R6 => 6,
6451        Reg::R7 => 7,
6452        Reg::R8 => 8,
6453        Reg::R9 => 9,
6454        Reg::R10 => 10,
6455        Reg::R11 => 11,
6456        Reg::R12 => 12,
6457        Reg::SP => 13,
6458        Reg::LR => 14,
6459        Reg::PC => 15,
6460    }
6461}
6462
6463/// Fallible form of the `verify_reg_bits` contract. PC (R15) is not a valid
6464/// data operand for the Thumb-2 encodings that use this guard (SDIV/UDIV/MLS/…
6465/// are UNPREDICTABLE with PC). Synth's own codegen never emits PC there, but
6466/// the encoder must stay *total* over arbitrary `ArmOp` inputs — the fuzz
6467/// harness (`encoder_no_panic`) requires Ok-or-Err, never a panic. Pre-fix, the
6468/// `debug_assert` in `verify_reg_bits` aborted under `-Cdebug-assertions`.
6469/// Returns a typed Err instead. See #185.
6470fn reg_bits_checked(bits: u32) -> Result<()> {
6471    if bits > 14 {
6472        return Err(synth_core::Error::synthesis(format!(
6473            "register bits {bits} (PC/R15) is not a valid operand for this Thumb-2 encoding"
6474        )));
6475    }
6476    Ok(())
6477}
6478
/// Try to express `val` as an ARM rotated immediate (`imm8 ROR 2*rot4`).
///
/// Rotations are tried from 0 upwards and the first that fits wins. The result
/// is `Some(((rot4 << 8) | imm8, 1))`, the second element being the I flag;
/// `None` means no rotation brings `val` into eight bits.
fn try_encode_rotated_imm(val: u32) -> Option<(u32, u32)> {
    (0..16u32).find_map(|rot| {
        // Rotating left by 2*rot undoes the ROR the hardware will apply.
        let imm8 = val.rotate_left(rot * 2);
        (imm8 <= 0xFF).then(|| ((rot << 8) | imm8, 1))
    })
}
6496
6497/// Encode operand2 field and return (bits, immediate_flag).
6498/// For ARM32 mode, immediates use the rotated-immediate encoding (imm8 ROR 2*rot4).
6499/// Panics if an immediate value cannot be represented. Callers that need large
6500/// immediates should use MOVW/MOVT instead of Operand2::Imm.
6501fn encode_operand2(op2: &Operand2) -> (u32, u32) {
6502    match op2 {
6503        Operand2::Imm(val) => {
6504            let uval = *val as u32;
6505            // Attempt rotated-immediate encoding (ARM32 Operand2)
6506            if let Some(encoded) = try_encode_rotated_imm(uval) {
6507                encoded
6508            } else {
6509                // Fallback: mask to 8 bits (legacy behavior for values that
6510                // cannot be represented). This should not be reached for
6511                // correctly-selected instructions; the instruction selector
6512                // must use MOVW/MOVT for large constants.
6513                let imm = uval & 0xFF;
6514                (imm, 1)
6515            }
6516        }
6517
6518        Operand2::Reg(reg) => {
6519            let reg_bits = reg_to_bits(reg);
6520            (reg_bits, 0) // I=0 for register
6521        }
6522
6523        Operand2::RegShift {
6524            rm,
6525            shift: _,
6526            amount,
6527        } => {
6528            // Simplified encoding with shift
6529            let rm_bits = reg_to_bits(rm);
6530            let shift_bits = (*amount & 0x1F) << 7;
6531            (shift_bits | rm_bits, 0)
6532        }
6533    }
6534}
6535
6536/// Encode memory address to (base_reg, offset)
6537fn encode_mem_addr(addr: &MemAddr) -> (u32, u32) {
6538    let base_bits = reg_to_bits(&addr.base);
6539    let offset_bits = (addr.offset as u32) & 0xFFF; // 12-bit offset
6540    (base_bits, offset_bits)
6541}
6542
6543/// S-register number: S0=0, S1=1, ..., S31=31
6544fn vfp_sreg_to_num(reg: &VfpReg) -> Result<u32> {
6545    match reg {
6546        VfpReg::S0 => Ok(0),
6547        VfpReg::S1 => Ok(1),
6548        VfpReg::S2 => Ok(2),
6549        VfpReg::S3 => Ok(3),
6550        VfpReg::S4 => Ok(4),
6551        VfpReg::S5 => Ok(5),
6552        VfpReg::S6 => Ok(6),
6553        VfpReg::S7 => Ok(7),
6554        VfpReg::S8 => Ok(8),
6555        VfpReg::S9 => Ok(9),
6556        VfpReg::S10 => Ok(10),
6557        VfpReg::S11 => Ok(11),
6558        VfpReg::S12 => Ok(12),
6559        VfpReg::S13 => Ok(13),
6560        VfpReg::S14 => Ok(14),
6561        VfpReg::S15 => Ok(15),
6562        VfpReg::S16 => Ok(16),
6563        VfpReg::S17 => Ok(17),
6564        VfpReg::S18 => Ok(18),
6565        VfpReg::S19 => Ok(19),
6566        VfpReg::S20 => Ok(20),
6567        VfpReg::S21 => Ok(21),
6568        VfpReg::S22 => Ok(22),
6569        VfpReg::S23 => Ok(23),
6570        VfpReg::S24 => Ok(24),
6571        VfpReg::S25 => Ok(25),
6572        VfpReg::S26 => Ok(26),
6573        VfpReg::S27 => Ok(27),
6574        VfpReg::S28 => Ok(28),
6575        VfpReg::S29 => Ok(29),
6576        VfpReg::S30 => Ok(30),
6577        VfpReg::S31 => Ok(31),
6578        // D-registers are not used in F32 single-precision encodings
6579        _ => Err(synth_core::Error::SynthesisError(
6580            "D-register not supported in single-precision VFP encoding".to_string(),
6581        )),
6582    }
6583}
6584
6585/// D-register number: D0=0, D1=1, ..., D15=15
6586fn vfp_dreg_to_num(reg: &VfpReg) -> Result<u32> {
6587    match reg {
6588        VfpReg::D0 => Ok(0),
6589        VfpReg::D1 => Ok(1),
6590        VfpReg::D2 => Ok(2),
6591        VfpReg::D3 => Ok(3),
6592        VfpReg::D4 => Ok(4),
6593        VfpReg::D5 => Ok(5),
6594        VfpReg::D6 => Ok(6),
6595        VfpReg::D7 => Ok(7),
6596        VfpReg::D8 => Ok(8),
6597        VfpReg::D9 => Ok(9),
6598        VfpReg::D10 => Ok(10),
6599        VfpReg::D11 => Ok(11),
6600        VfpReg::D12 => Ok(12),
6601        VfpReg::D13 => Ok(13),
6602        VfpReg::D14 => Ok(14),
6603        VfpReg::D15 => Ok(15),
6604        // S-registers are not used in F64 double-precision encodings
6605        _ => Err(synth_core::Error::SynthesisError(
6606            "S-register not supported in double-precision VFP encoding".to_string(),
6607        )),
6608    }
6609}
6610
/// Split an S-register number into `(Vx[3:0], qualifier_bit)` for VFP encoding.
///
/// For register number `s`, `Vx` is `s` without its lowest bit and the
/// qualifier is that lowest bit. Depending on the operand's role the qualifier
/// lands in D (bit 22), N (bit 7) or M (bit 5).
fn encode_sreg(s: u32) -> (u32, u32) {
    let upper = s / 2;
    let lowest = s % 2;
    (upper, lowest)
}
6617
/// Split a D-register number into `(Vx[3:0], qualifier_bit)` for VFP
/// double-precision encoding.
///
/// `Vx` is the low four bits of `d`; the qualifier is bit 4. For D0-D15 the
/// qualifier is therefore always 0.
fn encode_dreg(d: u32) -> (u32, u32) {
    let low_nibble = d % 16;
    let bit4 = (d / 16) % 2;
    (low_nibble, bit4)
}
6624
6625/// Encode a VFP 3-register arithmetic instruction (VADD.F32, VSUB.F32, VMUL.F32, VDIV.F32).
6626/// Returns the full 32-bit instruction word.
6627///
6628/// VFP encoding: [cond 1110] [D opc1 Vn] [Vd 101 sz] [N opc2 M 0 Vm]
6629/// For single-precision (sz=0), coprocessor = 0xA (bits[11:8]).
6630fn encode_vfp_3reg(base: u32, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<u32> {
6631    let sd_num = vfp_sreg_to_num(sd)?;
6632    let sn_num = vfp_sreg_to_num(sn)?;
6633    let sm_num = vfp_sreg_to_num(sm)?;
6634    let (vd, d) = encode_sreg(sd_num);
6635    let (vn, n) = encode_sreg(sn_num);
6636    let (vm, m) = encode_sreg(sm_num);
6637
6638    Ok(base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm)
6639}
6640
6641/// Encode a VFP 2-register instruction (VNEG.F32, VABS.F32, VSQRT.F32).
6642/// Returns the full 32-bit instruction word.
6643fn encode_vfp_2reg(base: u32, sd: &VfpReg, sm: &VfpReg) -> Result<u32> {
6644    let sd_num = vfp_sreg_to_num(sd)?;
6645    let sm_num = vfp_sreg_to_num(sm)?;
6646    let (vd, d) = encode_sreg(sd_num);
6647    let (vm, m) = encode_sreg(sm_num);
6648
6649    Ok(base | (d << 22) | (vd << 12) | (m << 5) | vm)
6650}
6651
6652/// Encode a VFP load/store (VLDR.F32 / VSTR.F32).
6653/// offset is in bytes and must be word-aligned; encoded as imm8 = offset/4.
6654/// U bit (bit 23) controls add/subtract offset.
6655fn encode_vfp_ldst(base: u32, sd: &VfpReg, addr: &MemAddr) -> Result<u32> {
6656    let sd_num = vfp_sreg_to_num(sd)?;
6657    let (vd, d) = encode_sreg(sd_num);
6658    let rn = reg_to_bits(&addr.base);
6659
6660    let offset = addr.offset;
6661    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6662    let abs_offset = offset.unsigned_abs();
6663    let imm8 = (abs_offset / 4) & 0xFF;
6664
6665    Ok(base | (u_bit << 23) | (d << 22) | (rn << 16) | (vd << 12) | imm8)
6666}
6667
6668/// Encode VMOV between core register and S-register.
6669/// VMOV Sn, Rt: 0xEE00_0A10 | (Vn << 16) | (N << 7) | (Rt << 12)
6670/// VMOV Rt, Sn: 0xEE10_0A10 | (Vn << 16) | (N << 7) | (Rt << 12)
6671fn encode_vmov_core_sreg(to_sreg: bool, sreg: &VfpReg, core: &Reg) -> Result<u32> {
6672    let s_num = vfp_sreg_to_num(sreg)?;
6673    let (vn, n) = encode_sreg(s_num);
6674    let rt = reg_to_bits(core);
6675
6676    let base = if to_sreg { 0xEE000A10 } else { 0xEE100A10 };
6677    Ok(base | (vn << 16) | (rt << 12) | (n << 7))
6678}
6679
6680/// Encode a VFP 3-register double-precision instruction (VADD.F64, VSUB.F64, etc.).
6681/// For double-precision (sz=1), coprocessor = 0xB (bits[11:8]).
6682/// The base should have bit 8 = 1 for F64 (0xB suffix instead of 0xA).
6683fn encode_vfp_3reg_f64(base: u32, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<u32> {
6684    let dd_num = vfp_dreg_to_num(dd)?;
6685    let dn_num = vfp_dreg_to_num(dn)?;
6686    let dm_num = vfp_dreg_to_num(dm)?;
6687    let (vd, d) = encode_dreg(dd_num);
6688    let (vn, n) = encode_dreg(dn_num);
6689    let (vm, m) = encode_dreg(dm_num);
6690
6691    Ok(base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm)
6692}
6693
6694/// Encode a VFP 2-register double-precision instruction (VNEG.F64, VABS.F64, VSQRT.F64).
6695fn encode_vfp_2reg_f64(base: u32, dd: &VfpReg, dm: &VfpReg) -> Result<u32> {
6696    let dd_num = vfp_dreg_to_num(dd)?;
6697    let dm_num = vfp_dreg_to_num(dm)?;
6698    let (vd, d) = encode_dreg(dd_num);
6699    let (vm, m) = encode_dreg(dm_num);
6700
6701    Ok(base | (d << 22) | (vd << 12) | (m << 5) | vm)
6702}
6703
6704/// Encode a VFP load/store for double-precision (VLDR.64 / VSTR.64).
6705/// offset is in bytes and must be word-aligned; encoded as imm8 = offset/4.
6706fn encode_vfp_ldst_f64(base: u32, dd: &VfpReg, addr: &MemAddr) -> Result<u32> {
6707    let dd_num = vfp_dreg_to_num(dd)?;
6708    let (vd, d) = encode_dreg(dd_num);
6709    let rn = reg_to_bits(&addr.base);
6710
6711    let offset = addr.offset;
6712    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6713    let abs_offset = offset.unsigned_abs();
6714    let imm8 = (abs_offset / 4) & 0xFF;
6715
6716    Ok(base | (u_bit << 23) | (d << 22) | (rn << 16) | (vd << 12) | imm8)
6717}
6718
6719/// Encode VMOV between two core registers and a D-register.
6720/// VMOV Dm, Rt, Rt2: 0xEC40_0B10 | (Rt2 << 16) | (Rt << 12) | (M << 5) | Vm
6721/// VMOV Rt, Rt2, Dm: 0xEC50_0B10 | (Rt2 << 16) | (Rt << 12) | (M << 5) | Vm
6722fn encode_vmov_core_dreg(
6723    to_dreg: bool,
6724    dreg: &VfpReg,
6725    core_lo: &Reg,
6726    core_hi: &Reg,
6727) -> Result<u32> {
6728    let d_num = vfp_dreg_to_num(dreg)?;
6729    let (vm, m) = encode_dreg(d_num);
6730    let rt = reg_to_bits(core_lo);
6731    let rt2 = reg_to_bits(core_hi);
6732
6733    let base = if to_dreg { 0xEC400B10 } else { 0xEC500B10 };
6734    Ok(base | (rt2 << 16) | (rt << 12) | (m << 5) | vm)
6735}
6736
/// Serialise a 32-bit VFP instruction word in Thumb-2 byte order: the upper
/// halfword first, then the lower halfword, each halfword little-endian.
fn vfp_to_thumb_bytes(instr: u32) -> Vec<u8> {
    // Little-endian word bytes are [lo.lo, lo.hi, hi.lo, hi.hi]; swap the halves.
    let [lo0, lo1, hi0, hi1] = instr.to_le_bytes();
    vec![hi0, hi1, lo0, lo1]
}
6745
6746// ============================================================================
6747// Helium MVE encoding helpers
6748// ============================================================================
6749
6750/// Q-register number: Q0=0, Q1=1, ..., Q7=7
6751fn qreg_to_num(reg: &QReg) -> u32 {
6752    match reg {
6753        QReg::Q0 => 0,
6754        QReg::Q1 => 1,
6755        QReg::Q2 => 2,
6756        QReg::Q3 => 3,
6757        QReg::Q4 => 4,
6758        QReg::Q5 => 5,
6759        QReg::Q6 => 6,
6760        QReg::Q7 => 7,
6761    }
6762}
6763
6764/// MVE element size to encoding bits: S8=0b00, S16=0b01, S32=0b10
6765fn mve_size_bits(size: &MveSize) -> u32 {
6766    match size {
6767        MveSize::S8 => 0b00,
6768        MveSize::S16 => 0b01,
6769        MveSize::S32 => 0b10,
6770    }
6771}
6772
6773/// Encode MVE 3-register instruction.
6774/// Q-registers are encoded as D-register pairs: Q0=D0:D1, Q1=D2:D3, etc.
6775/// In NEON/MVE encoding, the Q-register uses D-register number = Qn * 2.
6776fn encode_mve_3reg(base: u32, qd: &QReg, qn: &QReg, qm: &QReg) -> u32 {
6777    let d = qreg_to_num(qd) * 2;
6778    let n = qreg_to_num(qn) * 2;
6779    let m = qreg_to_num(qm) * 2;
6780
6781    // Standard NEON/MVE 3-register encoding:
6782    // D bit (bit 22) = Vd[4], Vd[3:0] = bits [15:12]
6783    // N bit (bit 7)  = Vn[4], Vn[3:0] = bits [19:16]
6784    // M bit (bit 5)  = Vm[4], Vm[3:0] = bits [3:0]
6785    let vd = d & 0xF;
6786    let d_bit = (d >> 4) & 1;
6787    let vn = n & 0xF;
6788    let n_bit = (n >> 4) & 1;
6789    let vm = m & 0xF;
6790    let m_bit = (m >> 4) & 1;
6791
6792    base | (d_bit << 22) | (vn << 16) | (vd << 12) | (n_bit << 7) | (m_bit << 5) | vm
6793}
6794
6795/// Encode MVE 3-register bitwise instruction (VAND, VORR, VEOR, VBIC).
6796fn encode_mve_3reg_bitwise(base: u32, qd: &QReg, qn: &QReg, qm: &QReg) -> u32 {
6797    encode_mve_3reg(base, qd, qn, qm)
6798}
6799
6800/// Encode MVE VLDRW.32 Qd, [Rn, #offset]
6801/// Format: EC9x xxxx - contiguous load, word-sized elements
6802fn encode_mve_vldrw(qd: &QReg, addr: &MemAddr) -> u32 {
6803    let qd_enc = qreg_to_num(qd) * 2;
6804    let rn = reg_to_bits(&addr.base);
6805    let offset = addr.offset;
6806    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6807    let abs_offset = offset.unsigned_abs();
6808    let imm7 = (abs_offset / 4) & 0x7F; // 7-bit word-aligned offset
6809
6810    // VLDRW.32 Qd, [Rn, #imm]: ED10 xx80 variant
6811    0xED100E80
6812        | (u_bit << 23)
6813        | ((qd_enc >> 4) << 22)
6814        | (rn << 16)
6815        | ((qd_enc & 0xF) << 12)
6816        | (imm7 & 0x7F)
6817}
6818
6819/// Encode MVE VSTRW.32 Qd, [Rn, #offset]
6820fn encode_mve_vstrw(qd: &QReg, addr: &MemAddr) -> u32 {
6821    let qd_enc = qreg_to_num(qd) * 2;
6822    let rn = reg_to_bits(&addr.base);
6823    let offset = addr.offset;
6824    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6825    let abs_offset = offset.unsigned_abs();
6826    let imm7 = (abs_offset / 4) & 0x7F;
6827
6828    0xED000E80
6829        | (u_bit << 23)
6830        | ((qd_enc >> 4) << 22)
6831        | (rn << 16)
6832        | ((qd_enc & 0xF) << 12)
6833        | (imm7 & 0x7F)
6834}
6835
6836impl ArmEncoder {
6837    /// Encode MVE constant load: MOVW+MOVT+VMOV for each 32-bit word, then assemble Q-register
6838    fn encode_thumb_mve_const(&self, qd: &QReg, bytes: &[u8; 16]) -> Result<Vec<u8>> {
6839        let mut result = Vec::new();
6840        let qd_num = qreg_to_num(qd);
6841
6842        // Load each 32-bit word into R12 (temp) then VMOV into S-register
6843        for i in 0..4 {
6844            let word = u32::from_le_bytes([
6845                bytes[i * 4],
6846                bytes[i * 4 + 1],
6847                bytes[i * 4 + 2],
6848                bytes[i * 4 + 3],
6849            ]);
6850            let lo16 = word & 0xFFFF;
6851            let hi16 = (word >> 16) & 0xFFFF;
6852
6853            // MOVW R12, #lo16
6854            result.extend_from_slice(&self.encode_thumb32_movw_raw(12, lo16)?);
6855            // MOVT R12, #hi16
6856            if hi16 != 0 {
6857                result.extend_from_slice(&self.encode_thumb32_movt_raw(12, hi16)?);
6858            }
6859
6860            // VMOV Sn, R12 where Sn = Qd*4 + i
6861            let s_num = qd_num * 4 + i as u32;
6862            let (vn, n) = encode_sreg(s_num);
6863            let vmov: u32 = 0xEE000A10 | (vn << 16) | (12 << 12) | (n << 7);
6864            result.extend_from_slice(&vfp_to_thumb_bytes(vmov));
6865        }
6866
6867        Ok(result)
6868    }
6869
6870    /// Encode lane-wise f32 binary operation (VDIV, etc.) via S-register extraction
6871    fn encode_thumb_mve_lane_wise_f32_binop(
6872        &self,
6873        qd: &QReg,
6874        qn: &QReg,
6875        qm: &QReg,
6876        vfp_base: u32,
6877    ) -> Result<Vec<u8>> {
6878        let mut result = Vec::new();
6879        let qd_num = qreg_to_num(qd);
6880        let qn_num = qreg_to_num(qn);
6881        let qm_num = qreg_to_num(qm);
6882
6883        // For each lane 0..3: use S-registers directly (Q aliasing)
6884        for i in 0..4u32 {
6885            let sd = qd_num * 4 + i;
6886            let sn = qn_num * 4 + i;
6887            let sm = qm_num * 4 + i;
6888
6889            let (vd, d) = encode_sreg(sd);
6890            let (vn, n) = encode_sreg(sn);
6891            let (vm, m) = encode_sreg(sm);
6892
6893            let instr = vfp_base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm;
6894            result.extend_from_slice(&vfp_to_thumb_bytes(instr));
6895        }
6896
6897        Ok(result)
6898    }
6899
6900    /// Encode lane-wise f32 VSQRT via S-register extraction
6901    fn encode_thumb_mve_lane_wise_f32_sqrt(&self, qd: &QReg, qm: &QReg) -> Result<Vec<u8>> {
6902        let mut result = Vec::new();
6903        let qd_num = qreg_to_num(qd);
6904        let qm_num = qreg_to_num(qm);
6905
6906        // VSQRT.F32 base: 0xEEB10AC0
6907        for i in 0..4u32 {
6908            let sd = qd_num * 4 + i;
6909            let sm = qm_num * 4 + i;
6910
6911            let (vd, d) = encode_sreg(sd);
6912            let (vm, m) = encode_sreg(sm);
6913
6914            let instr: u32 = 0xEEB10AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
6915            result.extend_from_slice(&vfp_to_thumb_bytes(instr));
6916        }
6917
6918        Ok(result)
6919    }
6920}
6921
6922#[cfg(test)]
6923mod tests {
6924    use super::*;
6925
6926    #[test]
6927    fn test_encoder_creation() {
6928        let encoder_arm = ArmEncoder::new_arm32();
6929        assert!(!encoder_arm.thumb_mode);
6930
6931        let encoder_thumb = ArmEncoder::new_thumb2();
6932        assert!(encoder_thumb.thumb_mode);
6933    }
6934
6935    /// #204 WAKE-path regression: `SetCond` materialized 0/1 with the 16-bit
6936    /// `MOVS Rd,#imm` (T1), whose Rd field is 3 bits (R0–R7). For a high Rd
6937    /// (R8–R12) `rd_bits << 8` overflows bit 11, flipping the opcode MOVS→CMP
6938    /// (`0x2c00`), so the boolean was never written — gale's `has_waiter` kept a
6939    /// stale value and the binary-sem WAKE dispatch read garbage. High Rd must
6940    /// use the 32-bit `MOV.W` (T2). Verify the bytes, not the IR.
6941    #[test]
6942    fn test_encode_setcond_high_reg_uses_mov_w_204() {
6943        use synth_synthesis::{ArmOp, Condition, Reg};
6944        let enc = ArmEncoder::new_thumb2();
6945        // R12 (high): must be ITE + MOV.W #1 + MOV.W #0, never a 16-bit MOVS/CMP.
6946        let hi = enc
6947            .encode(&ArmOp::SetCond {
6948                rd: Reg::R12,
6949                cond: Condition::NE,
6950            })
6951            .unwrap();
6952        assert_eq!(hi.len(), 10, "ITE(2) + MOV.W(4) + MOV.W(4): {hi:02x?}");
6953        // both value halfwords are MOV.W (0xF04F) — NOT the corrupt CMP (0x2c..).
6954        assert_eq!(&hi[2..4], &[0x4F, 0xF0], "then = MOV.W: {hi:02x?}");
6955        assert_eq!(&hi[6..8], &[0x4F, 0xF0], "else = MOV.W: {hi:02x?}");
6956        assert_eq!(hi[4] & 0x0F, 0x01, "then imm = #1");
6957        assert_eq!(hi[8] & 0x0F, 0x00, "else imm = #0");
6958        // Low Rd keeps the compact 16-bit MOVS form.
6959        let lo = enc
6960            .encode(&ArmOp::SetCond {
6961                rd: Reg::R0,
6962                cond: Condition::NE,
6963            })
6964            .unwrap();
6965        assert_eq!(lo.len(), 6, "ITE(2) + MOVS(2) + MOVS(2): {lo:02x?}");
6966        assert_eq!(lo[2..4], [0x01, 0x20], "then = MOVS R0,#1");
6967        assert_eq!(lo[4..6], [0x00, 0x20], "else = MOVS R0,#0");
6968    }
6969
6970    /// #178/#180 regression: the Thumb `Add`/`Adds`/`Subs` reg-forms used the
6971    /// 16-bit encoding unconditionally. For high registers (R12 base scratch,
6972    /// R8-R11 i64 pairs) the 3-bit register fields overflow and corrupt the
6973    /// operands — `add ip,ip,r0` came out as `adds r4,r5,r1` (0x186C), silently
6974    /// dropping the address operand and miscompiling every optimized memory
6975    /// access. High registers must use the 32-bit `.W` forms.
6976    #[test]
6977    fn test_encode_thumb_add_high_reg_uses_add_w_178_180() {
6978        let encoder = ArmEncoder::new_thumb2();
6979
6980        // add ip, ip, r0  — the exact MemLoad/MemStore base+addr op.
6981        let code = encoder
6982            .encode(&ArmOp::Add {
6983                rd: Reg::R12,
6984                rn: Reg::R12,
6985                op2: Operand2::Reg(Reg::R0),
6986            })
6987            .unwrap();
6988        // ADD.W ip, ip, r0 = EB0C 0C00 (little-endian halfwords).
6989        assert_eq!(
6990            code,
6991            vec![0x0C, 0xEB, 0x00, 0x0C],
6992            "high-reg Thumb ADD must be 32-bit ADD.W (EB0C 0C00), not corrupt 16-bit; got {code:02X?}"
6993        );
6994        // Must NOT be the buggy 16-bit 0x186C (`adds r4,r5,r1`).
6995        assert_ne!(code, vec![0x6C, 0x18], "regressed to corrupt 16-bit ADDS");
6996
6997        // Low-register add stays 16-bit (no regression for the common case).
6998        let lo = encoder
6999            .encode(&ArmOp::Add {
7000                rd: Reg::R1,
7001                rn: Reg::R2,
7002                op2: Operand2::Reg(Reg::R3),
7003            })
7004            .unwrap();
7005        assert_eq!(
7006            lo.len(),
7007            2,
7008            "low-reg ADD should remain 16-bit, got {lo:02X?}"
7009        );
7010    }
7011
7012    /// #178/#180 sibling: i64 low-word `Adds`/`Subs` can land in R8-R11 pairs;
7013    /// those must fall back to 32-bit ADDS.W/SUBS.W (flag-setting preserved).
7014    #[test]
7015    fn test_encode_thumb_adds_subs_high_reg_use_32bit_178_180() {
7016        let encoder = ArmEncoder::new_thumb2();
7017
7018        // adds r10, r10, r8  → ADDS.W = EB1A 0A08
7019        let adds = encoder
7020            .encode(&ArmOp::Adds {
7021                rd: Reg::R10,
7022                rn: Reg::R10,
7023                op2: Operand2::Reg(Reg::R8),
7024            })
7025            .unwrap();
7026        assert_eq!(
7027            adds,
7028            vec![0x1A, 0xEB, 0x08, 0x0A],
7029            "high-reg ADDS must be 32-bit ADDS.W (EB1A 0A08); got {adds:02X?}"
7030        );
7031
7032        // subs r10, r10, r8  → SUBS.W = EBBA 0A08
7033        let subs = encoder
7034            .encode(&ArmOp::Subs {
7035                rd: Reg::R10,
7036                rn: Reg::R10,
7037                op2: Operand2::Reg(Reg::R8),
7038            })
7039            .unwrap();
7040        assert_eq!(
7041            subs,
7042            vec![0xBA, 0xEB, 0x08, 0x0A],
7043            "high-reg SUBS must be 32-bit SUBS.W (EBBA 0A08); got {subs:02X?}"
7044        );
7045    }
7046
7047    /// #184 (sibling of #180): 16-bit CMN (T1) only encodes R0-R7. High registers
7048    /// must use 32-bit CMN.W, not the corrupt truncated 16-bit form.
7049    #[test]
7050    fn test_encode_thumb_cmn_high_reg_uses_cmn_w_184() {
7051        let encoder = ArmEncoder::new_thumb2();
7052
7053        // cmn r10, r8  → CMN.W = EB1A 0F08 (ADD.W S=1, Rd=PC discarded).
7054        let cmn = encoder
7055            .encode(&ArmOp::Cmn {
7056                rn: Reg::R10,
7057                op2: Operand2::Reg(Reg::R8),
7058            })
7059            .unwrap();
7060        assert_eq!(
7061            cmn,
7062            vec![0x1A, 0xEB, 0x08, 0x0F],
7063            "high-reg CMN must be 32-bit CMN.W (EB1A 0F08); got {cmn:02X?}"
7064        );
7065
7066        // Low registers stay 16-bit: cmn r1, r2 = 0x42D1.
7067        let lo = encoder
7068            .encode(&ArmOp::Cmn {
7069                rn: Reg::R1,
7070                op2: Operand2::Reg(Reg::R2),
7071            })
7072            .unwrap();
7073        assert_eq!(
7074            lo.len(),
7075            2,
7076            "low-reg CMN should remain 16-bit, got {lo:02X?}"
7077        );
7078        assert_eq!(lo, vec![0xD1, 0x42], "low-reg CMN bytes wrong: {lo:02X?}");
7079    }
7080
7081    /// #185 regression: feeding PC (R15) as a data operand to a Thumb-2 op that
7082    /// guards its registers must return Err, not panic under debug-assertions.
7083    /// (Synth never emits PC here; the fuzz harness requires encode() be total.)
7084    #[test]
7085    fn test_encode_pc_operand_returns_err_not_panic_185() {
7086        let encoder = ArmEncoder::new_thumb2();
7087        for op in [
7088            ArmOp::Sdiv {
7089                rd: Reg::PC,
7090                rn: Reg::R0,
7091                rm: Reg::R1,
7092            },
7093            ArmOp::Udiv {
7094                rd: Reg::R0,
7095                rn: Reg::PC,
7096                rm: Reg::R1,
7097            },
7098            ArmOp::Sdiv {
7099                rd: Reg::R0,
7100                rn: Reg::R1,
7101                rm: Reg::PC,
7102            },
7103        ] {
7104            let r = encoder.encode(&op);
7105            assert!(
7106                r.is_err(),
7107                "encode({op:?}) must return Err for a PC operand, got {r:?}"
7108            );
7109        }
7110        // Valid registers still encode fine (no false rejection).
7111        assert!(
7112            encoder
7113                .encode(&ArmOp::Sdiv {
7114                    rd: Reg::R0,
7115                    rn: Reg::R1,
7116                    rm: Reg::R2
7117                })
7118                .is_ok()
7119        );
7120    }
7121
7122    #[test]
7123    fn test_encode_nop_arm32() {
7124        let encoder = ArmEncoder::new_arm32();
7125        let code = encoder.encode(&ArmOp::Nop).unwrap();
7126
7127        assert_eq!(code.len(), 4); // ARM32 instructions are 4 bytes
7128        assert_eq!(code, vec![0x00, 0x00, 0xA0, 0xE1]); // MOV R0, R0
7129    }
7130
7131    #[test]
7132    fn test_encode_nop_thumb() {
7133        let encoder = ArmEncoder::new_thumb2();
7134        let code = encoder.encode(&ArmOp::Nop).unwrap();
7135
7136        assert_eq!(code.len(), 2); // Thumb instructions are 2 bytes
7137        assert_eq!(code, vec![0x00, 0xBF]); // NOP
7138    }
7139
7140    #[test]
7141    fn test_encode_mov_immediate_arm32() {
7142        let encoder = ArmEncoder::new_arm32();
7143        let op = ArmOp::Mov {
7144            rd: Reg::R0,
7145            op2: Operand2::Imm(42),
7146        };
7147
7148        let code = encoder.encode(&op).unwrap();
7149        assert_eq!(code.len(), 4);
7150
7151        // Verify it's a MOV instruction (bits should have immediate flag set)
7152        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7153        assert_eq!(instr & 0x0E000000, 0x02000000); // Check I bit is set
7154    }
7155
7156    #[test]
7157    fn test_encode_add_registers_arm32() {
7158        let encoder = ArmEncoder::new_arm32();
7159        let op = ArmOp::Add {
7160            rd: Reg::R0,
7161            rn: Reg::R1,
7162            op2: Operand2::Reg(Reg::R2),
7163        };
7164
7165        let code = encoder.encode(&op).unwrap();
7166        assert_eq!(code.len(), 4);
7167
7168        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7169        // Verify it's an ADD instruction with correct opcode
7170        assert_eq!(instr & 0x0FE00000, 0x00800000);
7171    }
7172
7173    #[test]
7174    fn test_encode_ldr_arm32() {
7175        let encoder = ArmEncoder::new_arm32();
7176        let op = ArmOp::Ldr {
7177            rd: Reg::R0,
7178            addr: MemAddr::imm(Reg::R1, 4),
7179        };
7180
7181        let code = encoder.encode(&op).unwrap();
7182        assert_eq!(code.len(), 4);
7183
7184        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7185        // Verify load bit is set
7186        assert_eq!(instr & 0x00100000, 0x00100000);
7187    }
7188
7189    #[test]
7190    fn test_encode_str_arm32() {
7191        let encoder = ArmEncoder::new_arm32();
7192        let op = ArmOp::Str {
7193            rd: Reg::R0,
7194            addr: MemAddr::imm(Reg::SP, 0),
7195        };
7196
7197        let code = encoder.encode(&op).unwrap();
7198        assert_eq!(code.len(), 4);
7199    }
7200
7201    #[test]
7202    fn test_encode_branch_arm32() {
7203        let encoder = ArmEncoder::new_arm32();
7204        let op = ArmOp::Bl {
7205            label: "main".to_string(),
7206        };
7207
7208        let code = encoder.encode(&op).unwrap();
7209        assert_eq!(code.len(), 4);
7210
7211        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7212        // Verify BL opcode
7213        assert_eq!(instr & 0x0F000000, 0x0B000000);
7214    }
7215
7216    /// Regression test for #167 + #174: the Thumb-2 BL relocatable placeholder
7217    /// must carry a -4 addend so an R_ARM_THM_CALL nets to exactly the symbol S.
7218    /// The correct encoding is what `gas` emits for `bl <extern>`: f7ff fffe
7219    /// (hw1=0xF7FF, hw2=0xFFFE), little-endian bytes FF F7 FE FF.
7220    ///   - 0xD000 (J1=J2=0) → ~+0x600000 garbage addend: `bl c0000c` / truncated
7221    ///     to fit (#167).
7222    ///   - 0xF800 (addend 0) → lands at S+4, one instruction past the callee
7223    ///     entry (#174).
7224    ///   - 0xFFFE (addend -4) → lands at S. Correct.
7225    #[test]
7226    fn test_encode_thumb_bl_placeholder_addend_167_174() {
7227        let encoder = ArmEncoder::new_thumb2();
7228        let op = ArmOp::Bl {
7229            label: "callee".to_string(),
7230        };
7231
7232        let code = encoder.encode(&op).unwrap();
7233        assert_eq!(code.len(), 4, "Thumb-2 BL is 32-bit");
7234
7235        let hw1 = u16::from_le_bytes([code[0], code[1]]);
7236        let hw2 = u16::from_le_bytes([code[2], code[3]]);
7237        assert_eq!(hw1, 0xF7FF, "BL first halfword (matches gas `bl <extern>`)");
7238        assert_eq!(
7239            hw2, 0xFFFE,
7240            "BL second halfword must be 0xFFFE (-4 addend → nets to S), not 0xF800 (→ S+4, #174) or 0xD000 (#167)"
7241        );
7242        assert_ne!(hw2, 0xF800, "0xF800 (addend 0) lands at S+4 (#174)");
7243        assert_ne!(hw2, 0xD000, "0xD000 bakes in a ~+0x600000 addend (#167)");
7244    }
7245
7246    #[test]
7247    fn test_encode_sequence() {
7248        let encoder = ArmEncoder::new_arm32();
7249        let ops = vec![
7250            ArmOp::Mov {
7251                rd: Reg::R0,
7252                op2: Operand2::Imm(42),
7253            },
7254            ArmOp::Mov {
7255                rd: Reg::R1,
7256                op2: Operand2::Imm(10),
7257            },
7258            ArmOp::Add {
7259                rd: Reg::R2,
7260                rn: Reg::R0,
7261                op2: Operand2::Reg(Reg::R1),
7262            },
7263        ];
7264
7265        let code = encoder.encode_sequence(&ops).unwrap();
7266        assert_eq!(code.len(), 12); // 3 instructions * 4 bytes
7267    }
7268
7269    #[test]
7270    fn test_reg_to_bits() {
7271        assert_eq!(reg_to_bits(&Reg::R0), 0);
7272        assert_eq!(reg_to_bits(&Reg::R7), 7);
7273        assert_eq!(reg_to_bits(&Reg::SP), 13);
7274        assert_eq!(reg_to_bits(&Reg::LR), 14);
7275        assert_eq!(reg_to_bits(&Reg::PC), 15);
7276    }
7277
7278    #[test]
7279    fn test_encode_bitwise_operations() {
7280        let encoder = ArmEncoder::new_arm32();
7281
7282        let and_op = ArmOp::And {
7283            rd: Reg::R0,
7284            rn: Reg::R1,
7285            op2: Operand2::Reg(Reg::R2),
7286        };
7287        let and_code = encoder.encode(&and_op).unwrap();
7288        assert_eq!(and_code.len(), 4);
7289
7290        let orr_op = ArmOp::Orr {
7291            rd: Reg::R0,
7292            rn: Reg::R1,
7293            op2: Operand2::Reg(Reg::R2),
7294        };
7295        let orr_code = encoder.encode(&orr_op).unwrap();
7296        assert_eq!(orr_code.len(), 4);
7297
7298        let eor_op = ArmOp::Eor {
7299            rd: Reg::R0,
7300            rn: Reg::R1,
7301            op2: Operand2::Reg(Reg::R2),
7302        };
7303        let eor_code = encoder.encode(&eor_op).unwrap();
7304        assert_eq!(eor_code.len(), 4);
7305    }
7306
7307    // === Thumb-2 32-bit encoding tests ===
7308
7309    #[test]
7310    fn test_encode_sdiv_thumb2() {
7311        let encoder = ArmEncoder::new_thumb2();
7312        let op = ArmOp::Sdiv {
7313            rd: Reg::R0,
7314            rn: Reg::R1,
7315            rm: Reg::R2,
7316        };
7317
7318        let code = encoder.encode(&op).unwrap();
7319        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7320
7321        // SDIV R0, R1, R2: 0xFB91 0xF0F2
7322        // First halfword: 0xFB90 | Rn(1) = 0xFB91
7323        // Second halfword: 0xF0F0 | Rd(0)<<8 | Rm(2) = 0xF0F2
7324        // Little-endian: [0x91, 0xFB, 0xF2, 0xF0]
7325        assert_eq!(code[0], 0x91);
7326        assert_eq!(code[1], 0xFB);
7327        assert_eq!(code[2], 0xF2);
7328        assert_eq!(code[3], 0xF0);
7329    }
7330
7331    #[test]
7332    fn test_encode_udiv_thumb2() {
7333        let encoder = ArmEncoder::new_thumb2();
7334        let op = ArmOp::Udiv {
7335            rd: Reg::R0,
7336            rn: Reg::R1,
7337            rm: Reg::R2,
7338        };
7339
7340        let code = encoder.encode(&op).unwrap();
7341        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7342
7343        // UDIV R0, R1, R2: 0xFBB1 0xF0F2
7344        // Little-endian: [0xB1, 0xFB, 0xF2, 0xF0]
7345        assert_eq!(code[0], 0xB1);
7346        assert_eq!(code[1], 0xFB);
7347        assert_eq!(code[2], 0xF2);
7348        assert_eq!(code[3], 0xF0);
7349    }
7350
7351    #[test]
7352    fn test_encode_mul_thumb2() {
7353        let encoder = ArmEncoder::new_thumb2();
7354        let op = ArmOp::Mul {
7355            rd: Reg::R0,
7356            rn: Reg::R1,
7357            rm: Reg::R2,
7358        };
7359
7360        let code = encoder.encode(&op).unwrap();
7361        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7362    }
7363
7364    #[test]
7365    fn test_encode_and_thumb2() {
7366        let encoder = ArmEncoder::new_thumb2();
7367        let op = ArmOp::And {
7368            rd: Reg::R0,
7369            rn: Reg::R1,
7370            op2: Operand2::Reg(Reg::R2),
7371        };
7372
7373        let code = encoder.encode(&op).unwrap();
7374        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7375    }
7376
7377    #[test]
7378    fn test_encode_lsl_thumb2_low_regs() {
7379        let encoder = ArmEncoder::new_thumb2();
7380        let op = ArmOp::Lsl {
7381            rd: Reg::R0,
7382            rn: Reg::R1,
7383            shift: 5,
7384        };
7385
7386        let code = encoder.encode(&op).unwrap();
7387        assert_eq!(code.len(), 2); // 16-bit for low registers
7388    }
7389
7390    #[test]
7391    fn test_encode_clz_thumb2() {
7392        let encoder = ArmEncoder::new_thumb2();
7393        let op = ArmOp::Clz {
7394            rd: Reg::R0,
7395            rm: Reg::R1,
7396        };
7397
7398        let code = encoder.encode(&op).unwrap();
7399        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7400    }
7401
7402    #[test]
7403    fn test_encode_bx_thumb2() {
7404        let encoder = ArmEncoder::new_thumb2();
7405        let op = ArmOp::Bx { rm: Reg::LR };
7406
7407        let code = encoder.encode(&op).unwrap();
7408        assert_eq!(code.len(), 2); // 16-bit instruction
7409
7410        // BX LR: 0x4770
7411        assert_eq!(code, vec![0x70, 0x47]);
7412    }
7413
7414    // ========================================================================
7415    // f32 pseudo-op encoding tests
7416    // ========================================================================
7417
7418    #[test]
7419    fn test_encode_f32_abs_arm32() {
7420        let encoder = ArmEncoder::new_arm32();
7421        let op = ArmOp::F32Abs {
7422            sd: VfpReg::S0,
7423            sm: VfpReg::S2,
7424        };
7425        let code = encoder.encode(&op).unwrap();
7426        assert_eq!(code.len(), 4); // Single VFP instruction
7427    }
7428
7429    #[test]
7430    fn test_encode_f32_neg_arm32() {
7431        let encoder = ArmEncoder::new_arm32();
7432        let op = ArmOp::F32Neg {
7433            sd: VfpReg::S0,
7434            sm: VfpReg::S2,
7435        };
7436        let code = encoder.encode(&op).unwrap();
7437        assert_eq!(code.len(), 4);
7438    }
7439
7440    #[test]
7441    fn test_encode_f32_sqrt_arm32() {
7442        let encoder = ArmEncoder::new_arm32();
7443        let op = ArmOp::F32Sqrt {
7444            sd: VfpReg::S0,
7445            sm: VfpReg::S2,
7446        };
7447        let code = encoder.encode(&op).unwrap();
7448        assert_eq!(code.len(), 4);
7449    }
7450
7451    #[test]
7452    fn test_encode_f32_ceil_arm32() {
7453        let encoder = ArmEncoder::new_arm32();
7454        let op = ArmOp::F32Ceil {
7455            sd: VfpReg::S0,
7456            sm: VfpReg::S2,
7457        };
7458        let code = encoder.encode(&op).unwrap();
7459        // VMRS + BIC + ORR + VMSR + VCVT.S32.F32 + VMRS + BIC + VMSR + VCVT.F32.S32
7460        assert_eq!(code.len(), 36);
7461    }
7462
7463    #[test]
7464    fn test_encode_f32_floor_thumb2() {
7465        let encoder = ArmEncoder::new_thumb2();
7466        let op = ArmOp::F32Floor {
7467            sd: VfpReg::S0,
7468            sm: VfpReg::S2,
7469        };
7470        let code = encoder.encode(&op).unwrap();
7471        // VMRS + BIC.W + ORR.W + VMSR + VCVT + VMRS + BIC.W + VMSR + VCVT.F32.S32
7472        assert_eq!(code.len(), 36);
7473    }
7474
7475    #[test]
7476    fn test_encode_f32_min_arm32() {
7477        let encoder = ArmEncoder::new_arm32();
7478        let op = ArmOp::F32Min {
7479            sd: VfpReg::S0,
7480            sn: VfpReg::S2,
7481            sm: VfpReg::S4,
7482        };
7483        let code = encoder.encode(&op).unwrap();
7484        assert_eq!(code.len(), 16); // VMOV + VCMP + VMRS + conditional VMOV
7485    }
7486
7487    #[test]
7488    fn test_encode_f32_max_thumb2() {
7489        let encoder = ArmEncoder::new_thumb2();
7490        let op = ArmOp::F32Max {
7491            sd: VfpReg::S0,
7492            sn: VfpReg::S2,
7493            sm: VfpReg::S4,
7494        };
7495        let code = encoder.encode(&op).unwrap();
7496        // VMOV(4) + VCMP(4) + VMRS(4) + IT(2) + VMOV(4) = 18
7497        assert_eq!(code.len(), 18);
7498    }
7499
7500    #[test]
7501    fn test_encode_f32_copysign_arm32() {
7502        let encoder = ArmEncoder::new_arm32();
7503        let op = ArmOp::F32Copysign {
7504            sd: VfpReg::S0,
7505            sn: VfpReg::S2,
7506            sm: VfpReg::S4,
7507        };
7508        let code = encoder.encode(&op).unwrap();
7509        // VMOV + VMOV + AND + BIC + ORR + VMOV = 6 * 4 = 24
7510        assert_eq!(code.len(), 24);
7511    }
7512
7513    // ========================================================================
7514    // f64 encoding tests
7515    // ========================================================================
7516
7517    #[test]
7518    fn test_encode_f64_add_arm32() {
7519        let encoder = ArmEncoder::new_arm32();
7520        let op = ArmOp::F64Add {
7521            dd: VfpReg::D0,
7522            dn: VfpReg::D1,
7523            dm: VfpReg::D2,
7524        };
7525        let code = encoder.encode(&op).unwrap();
7526        assert_eq!(code.len(), 4);
7527        // VADD.F64 D0, D1, D2: check coprocessor is cp11 (0xB)
7528        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7529        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11
7530    }
7531
7532    #[test]
7533    fn test_encode_f64_sub_thumb2() {
7534        let encoder = ArmEncoder::new_thumb2();
7535        let op = ArmOp::F64Sub {
7536            dd: VfpReg::D0,
7537            dn: VfpReg::D1,
7538            dm: VfpReg::D2,
7539        };
7540        let code = encoder.encode(&op).unwrap();
7541        assert_eq!(code.len(), 4); // 32-bit VFP as two Thumb halfwords
7542    }
7543
7544    #[test]
7545    fn test_encode_f64_mul_arm32() {
7546        let encoder = ArmEncoder::new_arm32();
7547        let op = ArmOp::F64Mul {
7548            dd: VfpReg::D0,
7549            dn: VfpReg::D1,
7550            dm: VfpReg::D2,
7551        };
7552        let code = encoder.encode(&op).unwrap();
7553        assert_eq!(code.len(), 4);
7554    }
7555
7556    #[test]
7557    fn test_encode_f64_div_arm32() {
7558        let encoder = ArmEncoder::new_arm32();
7559        let op = ArmOp::F64Div {
7560            dd: VfpReg::D0,
7561            dn: VfpReg::D1,
7562            dm: VfpReg::D2,
7563        };
7564        let code = encoder.encode(&op).unwrap();
7565        assert_eq!(code.len(), 4);
7566    }
7567
7568    #[test]
7569    fn test_encode_f64_abs_arm32() {
7570        let encoder = ArmEncoder::new_arm32();
7571        let op = ArmOp::F64Abs {
7572            dd: VfpReg::D0,
7573            dm: VfpReg::D2,
7574        };
7575        let code = encoder.encode(&op).unwrap();
7576        assert_eq!(code.len(), 4);
7577    }
7578
7579    #[test]
7580    fn test_encode_f64_neg_arm32() {
7581        let encoder = ArmEncoder::new_arm32();
7582        let op = ArmOp::F64Neg {
7583            dd: VfpReg::D0,
7584            dm: VfpReg::D2,
7585        };
7586        let code = encoder.encode(&op).unwrap();
7587        assert_eq!(code.len(), 4);
7588    }
7589
7590    #[test]
7591    fn test_encode_f64_sqrt_arm32() {
7592        let encoder = ArmEncoder::new_arm32();
7593        let op = ArmOp::F64Sqrt {
7594            dd: VfpReg::D0,
7595            dm: VfpReg::D2,
7596        };
7597        let code = encoder.encode(&op).unwrap();
7598        assert_eq!(code.len(), 4);
7599    }
7600
7601    #[test]
7602    fn test_encode_f64_load_arm32() {
7603        let encoder = ArmEncoder::new_arm32();
7604        let op = ArmOp::F64Load {
7605            dd: VfpReg::D0,
7606            addr: MemAddr::imm(Reg::R0, 8),
7607        };
7608        let code = encoder.encode(&op).unwrap();
7609        assert_eq!(code.len(), 4);
7610        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7611        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11 for F64
7612        assert_eq!(instr & 0xFF, 2); // offset 8 / 4 = 2
7613    }
7614
7615    #[test]
7616    fn test_encode_f64_store_thumb2() {
7617        let encoder = ArmEncoder::new_thumb2();
7618        let op = ArmOp::F64Store {
7619            dd: VfpReg::D0,
7620            addr: MemAddr::imm(Reg::SP, 0),
7621        };
7622        let code = encoder.encode(&op).unwrap();
7623        assert_eq!(code.len(), 4);
7624    }
7625
7626    #[test]
7627    fn test_encode_f64_compare_arm32() {
7628        let encoder = ArmEncoder::new_arm32();
7629        let op = ArmOp::F64Eq {
7630            rd: Reg::R0,
7631            dn: VfpReg::D0,
7632            dm: VfpReg::D1,
7633        };
7634        let code = encoder.encode(&op).unwrap();
7635        assert_eq!(code.len(), 16); // VCMP + VMRS + MOV #0 + MOVcond #1
7636    }
7637
7638    #[test]
7639    fn test_encode_f64_compare_thumb2() {
7640        let encoder = ArmEncoder::new_thumb2();
7641        let op = ArmOp::F64Lt {
7642            rd: Reg::R0,
7643            dn: VfpReg::D0,
7644            dm: VfpReg::D1,
7645        };
7646        let code = encoder.encode(&op).unwrap();
7647        // VCMP(4) + VMRS(4) + MOVS(2) + IT(2) + MOV(2) = 14
7648        assert_eq!(code.len(), 14);
7649    }
7650
7651    #[test]
7652    fn test_encode_f64_const_arm32() {
7653        let encoder = ArmEncoder::new_arm32();
7654        let op = ArmOp::F64Const {
7655            dd: VfpReg::D0,
7656            value: 3.125,
7657        };
7658        let code = encoder.encode(&op).unwrap();
7659        // MOVW(4) + MOVT(4) + MOVW(4) + MOVT(4) + VMOV(4) = 20
7660        assert_eq!(code.len(), 20);
7661    }
7662
7663    #[test]
7664    fn test_encode_f64_const_thumb2() {
7665        let encoder = ArmEncoder::new_thumb2();
7666        let op = ArmOp::F64Const {
7667            dd: VfpReg::D0,
7668            value: 2.5,
7669        };
7670        let code = encoder.encode(&op).unwrap();
7671        // MOVW(4) + MOVT(4) + MOVW(4) + MOVT(4) + VMOV(4) = 20
7672        assert_eq!(code.len(), 20);
7673    }
7674
7675    #[test]
7676    fn test_encode_f64_convert_i32s_arm32() {
7677        let encoder = ArmEncoder::new_arm32();
7678        let op = ArmOp::F64ConvertI32S {
7679            dd: VfpReg::D0,
7680            rm: Reg::R0,
7681        };
7682        let code = encoder.encode(&op).unwrap();
7683        // VMOV(4) + VCVT(4) = 8
7684        assert_eq!(code.len(), 8);
7685    }
7686
7687    #[test]
7688    fn test_encode_f64_promote_f32_arm32() {
7689        let encoder = ArmEncoder::new_arm32();
7690        let op = ArmOp::F64PromoteF32 {
7691            dd: VfpReg::D0,
7692            sm: VfpReg::S0,
7693        };
7694        let code = encoder.encode(&op).unwrap();
7695        assert_eq!(code.len(), 4); // Single VCVT.F64.F32 instruction
7696    }
7697
7698    #[test]
7699    fn test_encode_f64_promote_f32_thumb2() {
7700        let encoder = ArmEncoder::new_thumb2();
7701        let op = ArmOp::F64PromoteF32 {
7702            dd: VfpReg::D0,
7703            sm: VfpReg::S0,
7704        };
7705        let code = encoder.encode(&op).unwrap();
7706        assert_eq!(code.len(), 4);
7707    }
7708
7709    #[test]
7710    fn test_encode_i32_trunc_f64s_arm32() {
7711        let encoder = ArmEncoder::new_arm32();
7712        let op = ArmOp::I32TruncF64S {
7713            rd: Reg::R0,
7714            dm: VfpReg::D0,
7715        };
7716        let code = encoder.encode(&op).unwrap();
7717        // VCVT(4) + VMOV(4) = 8
7718        assert_eq!(code.len(), 8);
7719    }
7720
7721    #[test]
7722    fn test_encode_f64_reinterpret_i64_arm32() {
7723        let encoder = ArmEncoder::new_arm32();
7724        let op = ArmOp::F64ReinterpretI64 {
7725            dd: VfpReg::D0,
7726            rmlo: Reg::R0,
7727            rmhi: Reg::R1,
7728        };
7729        let code = encoder.encode(&op).unwrap();
7730        assert_eq!(code.len(), 4); // Single VMOV instruction
7731    }
7732
7733    #[test]
7734    fn test_encode_i64_reinterpret_f64_thumb2() {
7735        let encoder = ArmEncoder::new_thumb2();
7736        let op = ArmOp::I64ReinterpretF64 {
7737            rdlo: Reg::R0,
7738            rdhi: Reg::R1,
7739            dm: VfpReg::D0,
7740        };
7741        let code = encoder.encode(&op).unwrap();
7742        assert_eq!(code.len(), 4);
7743    }
7744
7745    #[test]
7746    fn test_encode_f64_trunc_thumb2() {
7747        let encoder = ArmEncoder::new_thumb2();
7748        let op = ArmOp::F64Trunc {
7749            dd: VfpReg::D0,
7750            dm: VfpReg::D1,
7751        };
7752        let code = encoder.encode(&op).unwrap();
7753        // Two VFP instructions via Thumb encoding
7754        assert_eq!(code.len(), 8);
7755    }
7756
7757    #[test]
7758    fn test_encode_f64_min_arm32() {
7759        let encoder = ArmEncoder::new_arm32();
7760        let op = ArmOp::F64Min {
7761            dd: VfpReg::D0,
7762            dn: VfpReg::D1,
7763            dm: VfpReg::D2,
7764        };
7765        let code = encoder.encode(&op).unwrap();
7766        // VMOV + VCMP + VMRS + conditional VMOV = 16
7767        assert_eq!(code.len(), 16);
7768    }
7769
7770    #[test]
7771    fn test_f64_cp11_encoding() {
7772        // Verify that F64 instructions use coprocessor 11 (0xB), not 10 (0xA)
7773        let encoder = ArmEncoder::new_arm32();
7774
7775        // F64Add
7776        let code = encoder
7777            .encode(&ArmOp::F64Add {
7778                dd: VfpReg::D0,
7779                dn: VfpReg::D0,
7780                dm: VfpReg::D0,
7781            })
7782            .unwrap();
7783        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7784        assert_eq!((instr >> 8) & 0xF, 0xB, "F64 should use cp11");
7785
7786        // F32Add for comparison
7787        let code = encoder
7788            .encode(&ArmOp::F32Add {
7789                sd: VfpReg::S0,
7790                sn: VfpReg::S0,
7791                sm: VfpReg::S0,
7792            })
7793            .unwrap();
7794        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7795        assert_eq!((instr >> 8) & 0xF, 0xA, "F32 should use cp10");
7796    }
7797
7798    #[test]
7799    fn test_dreg_encoding_higher_registers() {
7800        let encoder = ArmEncoder::new_arm32();
7801
7802        // Test with D15 (highest register)
7803        let op = ArmOp::F64Add {
7804            dd: VfpReg::D15,
7805            dn: VfpReg::D14,
7806            dm: VfpReg::D13,
7807        };
7808        let code = encoder.encode(&op).unwrap();
7809        assert_eq!(code.len(), 4);
7810
7811        // Verify the register encoding worked (instruction is valid)
7812        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7813        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11
7814    }
7815
7816    // ========================================================================
7817    // Control flow encoding tests
7818    // ========================================================================
7819
7820    #[test]
7821    fn test_encode_label_emits_no_bytes() {
7822        let encoder = ArmEncoder::new_thumb2();
7823        let op = ArmOp::Label {
7824            name: ".Lblock_end_0".to_string(),
7825        };
7826        let code = encoder.encode(&op).unwrap();
7827        assert!(code.is_empty(), "Label should emit zero bytes");
7828
7829        let encoder32 = ArmEncoder::new_arm32();
7830        let code32 = encoder32.encode(&op).unwrap();
7831        assert!(
7832            code32.is_empty(),
7833            "Label should emit zero bytes in ARM32 too"
7834        );
7835    }
7836
7837    #[test]
7838    fn test_encode_bcc_eq_thumb2() {
7839        use synth_synthesis::Condition;
7840        let encoder = ArmEncoder::new_thumb2();
7841        let op = ArmOp::Bcc {
7842            cond: Condition::EQ,
7843            label: "target".to_string(),
7844        };
7845        let code = encoder.encode(&op).unwrap();
7846        assert_eq!(code.len(), 2); // 16-bit conditional branch
7847
7848        // BEQ with offset 0: 0xD000 in little-endian
7849        assert_eq!(code, vec![0x00, 0xD0]);
7850    }
7851
7852    #[test]
7853    fn test_encode_bcc_ne_thumb2() {
7854        use synth_synthesis::Condition;
7855        let encoder = ArmEncoder::new_thumb2();
7856        let op = ArmOp::Bcc {
7857            cond: Condition::NE,
7858            label: "target".to_string(),
7859        };
7860        let code = encoder.encode(&op).unwrap();
7861        assert_eq!(code.len(), 2);
7862
7863        // BNE with offset 0: 0xD100 in little-endian
7864        assert_eq!(code, vec![0x00, 0xD1]);
7865    }
7866
7867    #[test]
7868    fn test_encode_bcc_arm32() {
7869        use synth_synthesis::Condition;
7870        let encoder = ArmEncoder::new_arm32();
7871        let op = ArmOp::Bcc {
7872            cond: Condition::EQ,
7873            label: "target".to_string(),
7874        };
7875        let code = encoder.encode(&op).unwrap();
7876        assert_eq!(code.len(), 4); // 32-bit ARM instruction
7877
7878        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7879        // BEQ: cond=0x0, opcode=0xA, offset=0
7880        assert_eq!(instr & 0xF0000000, 0x00000000); // EQ condition
7881        assert_eq!(instr & 0x0F000000, 0x0A000000); // Branch opcode
7882    }
7883
7884    #[test]
7885    fn test_encode_udf_thumb2() {
7886        let encoder = ArmEncoder::new_thumb2();
7887        let op = ArmOp::Udf { imm: 0 };
7888        let code = encoder.encode(&op).unwrap();
7889        assert_eq!(code.len(), 2); // 16-bit
7890
7891        // UDF #0: 0xDE00 in little-endian
7892        assert_eq!(code, vec![0x00, 0xDE]);
7893    }
7894
7895    #[test]
7896    fn test_encode_nop_thumb2() {
7897        let encoder = ArmEncoder::new_thumb2();
7898        let op = ArmOp::Nop;
7899        let code = encoder.encode(&op).unwrap();
7900        assert_eq!(code.len(), 2); // 16-bit
7901
7902        // NOP: 0xBF00 in little-endian
7903        assert_eq!(code, vec![0x00, 0xBF]);
7904    }
7905
7906    // =========================================================================
7907    // i64 Thumb-2 encoding tests
7908    // =========================================================================
7909
7910    #[test]
7911    fn test_encode_i64_add_thumb2() {
7912        let encoder = ArmEncoder::new_thumb2();
7913        let op = ArmOp::I64Add {
7914            rdlo: Reg::R0,
7915            rdhi: Reg::R1,
7916            rnlo: Reg::R0,
7917            rnhi: Reg::R1,
7918            rmlo: Reg::R2,
7919            rmhi: Reg::R3,
7920        };
7921        let code = encoder.encode(&op).unwrap();
7922        // Should emit ADDS (2 bytes) + ADC.W (4 bytes) = 6 bytes
7923        assert_eq!(code.len(), 6, "I64Add should be 6 bytes (ADDS + ADC.W)");
7924    }
7925
7926    #[test]
7927    fn test_encode_i64_sub_thumb2() {
7928        let encoder = ArmEncoder::new_thumb2();
7929        let op = ArmOp::I64Sub {
7930            rdlo: Reg::R0,
7931            rdhi: Reg::R1,
7932            rnlo: Reg::R0,
7933            rnhi: Reg::R1,
7934            rmlo: Reg::R2,
7935            rmhi: Reg::R3,
7936        };
7937        let code = encoder.encode(&op).unwrap();
7938        // Should emit SUBS (2 bytes) + SBC.W (4 bytes) = 6 bytes
7939        assert_eq!(code.len(), 6, "I64Sub should be 6 bytes (SUBS + SBC.W)");
7940    }
7941
7942    #[test]
7943    fn test_encode_i64_and_thumb2() {
7944        let encoder = ArmEncoder::new_thumb2();
7945        let op = ArmOp::I64And {
7946            rdlo: Reg::R0,
7947            rdhi: Reg::R1,
7948            rnlo: Reg::R0,
7949            rnhi: Reg::R1,
7950            rmlo: Reg::R2,
7951            rmhi: Reg::R3,
7952        };
7953        let code = encoder.encode(&op).unwrap();
7954        // AND.W (4 bytes) + AND.W (4 bytes) = 8 bytes
7955        assert!(code.len() >= 4, "I64And should emit at least 4 bytes");
7956    }
7957
7958    #[test]
7959    fn test_encode_i64_or_thumb2() {
7960        let encoder = ArmEncoder::new_thumb2();
7961        let op = ArmOp::I64Or {
7962            rdlo: Reg::R0,
7963            rdhi: Reg::R1,
7964            rnlo: Reg::R0,
7965            rnhi: Reg::R1,
7966            rmlo: Reg::R2,
7967            rmhi: Reg::R3,
7968        };
7969        let code = encoder.encode(&op).unwrap();
7970        assert!(code.len() >= 4, "I64Or should emit at least 4 bytes");
7971    }
7972
7973    #[test]
7974    fn test_encode_i64_xor_thumb2() {
7975        let encoder = ArmEncoder::new_thumb2();
7976        let op = ArmOp::I64Xor {
7977            rdlo: Reg::R0,
7978            rdhi: Reg::R1,
7979            rnlo: Reg::R0,
7980            rnhi: Reg::R1,
7981            rmlo: Reg::R2,
7982            rmhi: Reg::R3,
7983        };
7984        let code = encoder.encode(&op).unwrap();
7985        assert!(code.len() >= 4, "I64Xor should emit at least 4 bytes");
7986    }
7987
7988    #[test]
7989    fn test_encode_i64_const_small_thumb2() {
7990        let encoder = ArmEncoder::new_thumb2();
7991        // Small constant: only needs MOVW for each half
7992        let op = ArmOp::I64Const {
7993            rdlo: Reg::R0,
7994            rdhi: Reg::R1,
7995            value: 42,
7996        };
7997        let code = encoder.encode(&op).unwrap();
7998        // MOVW R0, #42 (4 bytes) + MOVW R1, #0 (4 bytes) = 8 bytes minimum
7999        assert!(code.len() >= 8, "I64Const should emit at least 8 bytes");
8000    }
8001
8002    #[test]
8003    fn test_encode_i64_const_large_thumb2() {
8004        let encoder = ArmEncoder::new_thumb2();
8005        // Large constant: needs MOVW+MOVT for each half
8006        let op = ArmOp::I64Const {
8007            rdlo: Reg::R0,
8008            rdhi: Reg::R1,
8009            value: 0x1234_5678_9ABC_DEF0_u64 as i64,
8010        };
8011        let code = encoder.encode(&op).unwrap();
8012        // MOVW + MOVT for lo (8 bytes) + MOVW + MOVT for hi (8 bytes) = 16 bytes
8013        assert_eq!(
8014            code.len(),
8015            16,
8016            "I64Const with large value should be 16 bytes"
8017        );
8018    }
8019
8020    #[test]
8021    fn test_encode_i64_extend_i32_s_thumb2() {
8022        let encoder = ArmEncoder::new_thumb2();
8023        let op = ArmOp::I64ExtendI32S {
8024            rdlo: Reg::R0,
8025            rdhi: Reg::R1,
8026            rn: Reg::R0,
8027        };
8028        let code = encoder.encode(&op).unwrap();
8029        // When rdlo == rn, only ASR (4 bytes) is emitted
8030        assert_eq!(
8031            code.len(),
8032            4,
8033            "I64ExtendI32S (same reg) should be 4 bytes (ASR only)"
8034        );
8035    }
8036
8037    #[test]
8038    fn test_encode_i64_extend_i32_s_diff_reg_thumb2() {
8039        let encoder = ArmEncoder::new_thumb2();
8040        let op = ArmOp::I64ExtendI32S {
8041            rdlo: Reg::R0,
8042            rdhi: Reg::R1,
8043            rn: Reg::R2,
8044        };
8045        let code = encoder.encode(&op).unwrap();
8046        // MOV rdlo, rn (2 bytes for low regs) + ASR rdhi, rdlo, #31 (4 bytes) = 6 bytes
8047        assert!(
8048            code.len() >= 6,
8049            "I64ExtendI32S (diff reg) should be at least 6 bytes"
8050        );
8051    }
8052
8053    #[test]
8054    fn test_encode_i64_extend_i32_u_thumb2() {
8055        let encoder = ArmEncoder::new_thumb2();
8056        let op = ArmOp::I64ExtendI32U {
8057            rdlo: Reg::R0,
8058            rdhi: Reg::R1,
8059            rn: Reg::R0,
8060        };
8061        let code = encoder.encode(&op).unwrap();
8062        // When rdlo == rn, only MOV rdhi, #0 (2 bytes) is emitted
8063        assert_eq!(
8064            code.len(),
8065            2,
8066            "I64ExtendI32U (same reg) should be 2 bytes (MOV #0 only)"
8067        );
8068    }
8069
8070    #[test]
8071    fn test_encode_i32_wrap_i64_nop_thumb2() {
8072        let encoder = ArmEncoder::new_thumb2();
8073        // When rd == rnlo, should be a NOP
8074        let op = ArmOp::I32WrapI64 {
8075            rd: Reg::R0,
8076            rnlo: Reg::R0,
8077        };
8078        let code = encoder.encode(&op).unwrap();
8079        assert_eq!(code.len(), 2, "I32WrapI64 same reg should be NOP (2 bytes)");
8080        assert_eq!(code, vec![0x00, 0xBF]); // NOP
8081    }
8082
8083    #[test]
8084    fn test_encode_i32_wrap_i64_diff_reg_thumb2() {
8085        let encoder = ArmEncoder::new_thumb2();
8086        let op = ArmOp::I32WrapI64 {
8087            rd: Reg::R2,
8088            rnlo: Reg::R0,
8089        };
8090        let code = encoder.encode(&op).unwrap();
8091        // MOV R2, R0 (2 or 4 bytes)
8092        assert!(
8093            code.len() >= 2,
8094            "I32WrapI64 diff reg should emit at least 2 bytes"
8095        );
8096    }
8097
8098    #[test]
8099    fn test_encode_i64_eqz_thumb2() {
8100        let encoder = ArmEncoder::new_thumb2();
8101        let op = ArmOp::I64Eqz {
8102            rd: Reg::R0,
8103            rnlo: Reg::R0,
8104            rnhi: Reg::R1,
8105        };
8106        let code = encoder.encode(&op).unwrap();
8107        // Delegates to I64SetCondZ which is already encoded
8108        assert!(
8109            code.len() >= 6,
8110            "I64Eqz should emit at least 6 bytes for ORR+ITE+MOV+MOV"
8111        );
8112    }
8113
8114    #[test]
8115    fn test_encode_i64_eq_thumb2() {
8116        let encoder = ArmEncoder::new_thumb2();
8117        let op = ArmOp::I64Eq {
8118            rd: Reg::R0,
8119            rnlo: Reg::R0,
8120            rnhi: Reg::R1,
8121            rmlo: Reg::R2,
8122            rmhi: Reg::R3,
8123        };
8124        let code = encoder.encode(&op).unwrap();
8125        // Delegates to I64SetCond EQ: CMP lo + IT EQ + CMPEQ hi + ITE EQ + MOV 1 + MOV 0
8126        assert!(code.len() >= 10, "I64Eq should emit at least 10 bytes");
8127    }
8128
8129    #[test]
8130    fn test_encode_i64_ldr_thumb2() {
8131        let encoder = ArmEncoder::new_thumb2();
8132        let op = ArmOp::I64Ldr {
8133            rdlo: Reg::R0,
8134            rdhi: Reg::R1,
8135            addr: MemAddr::imm(Reg::SP, 0),
8136        };
8137        let code = encoder.encode(&op).unwrap();
8138        // Two LDR instructions (lo at offset, hi at offset+4)
8139        assert!(code.len() >= 4, "I64Ldr should emit at least 4 bytes");
8140    }
8141
8142    #[test]
8143    fn test_encode_i64_str_thumb2() {
8144        let encoder = ArmEncoder::new_thumb2();
8145        let op = ArmOp::I64Str {
8146            rdlo: Reg::R0,
8147            rdhi: Reg::R1,
8148            addr: MemAddr::imm(Reg::SP, 0),
8149        };
8150        let code = encoder.encode(&op).unwrap();
8151        // Two STR instructions (lo at offset, hi at offset+4)
8152        assert!(code.len() >= 4, "I64Str should emit at least 4 bytes");
8153    }
8154
8155    #[test]
8156    fn test_encode_i64_all_comparisons_thumb2() {
8157        let encoder = ArmEncoder::new_thumb2();
8158
8159        let ops = vec![
8160            ArmOp::I64Ne {
8161                rd: Reg::R0,
8162                rnlo: Reg::R0,
8163                rnhi: Reg::R1,
8164                rmlo: Reg::R2,
8165                rmhi: Reg::R3,
8166            },
8167            ArmOp::I64LtS {
8168                rd: Reg::R0,
8169                rnlo: Reg::R0,
8170                rnhi: Reg::R1,
8171                rmlo: Reg::R2,
8172                rmhi: Reg::R3,
8173            },
8174            ArmOp::I64LtU {
8175                rd: Reg::R0,
8176                rnlo: Reg::R0,
8177                rnhi: Reg::R1,
8178                rmlo: Reg::R2,
8179                rmhi: Reg::R3,
8180            },
8181            ArmOp::I64LeS {
8182                rd: Reg::R0,
8183                rnlo: Reg::R0,
8184                rnhi: Reg::R1,
8185                rmlo: Reg::R2,
8186                rmhi: Reg::R3,
8187            },
8188            ArmOp::I64LeU {
8189                rd: Reg::R0,
8190                rnlo: Reg::R0,
8191                rnhi: Reg::R1,
8192                rmlo: Reg::R2,
8193                rmhi: Reg::R3,
8194            },
8195            ArmOp::I64GtS {
8196                rd: Reg::R0,
8197                rnlo: Reg::R0,
8198                rnhi: Reg::R1,
8199                rmlo: Reg::R2,
8200                rmhi: Reg::R3,
8201            },
8202            ArmOp::I64GtU {
8203                rd: Reg::R0,
8204                rnlo: Reg::R0,
8205                rnhi: Reg::R1,
8206                rmlo: Reg::R2,
8207                rmhi: Reg::R3,
8208            },
8209            ArmOp::I64GeS {
8210                rd: Reg::R0,
8211                rnlo: Reg::R0,
8212                rnhi: Reg::R1,
8213                rmlo: Reg::R2,
8214                rmhi: Reg::R3,
8215            },
8216            ArmOp::I64GeU {
8217                rd: Reg::R0,
8218                rnlo: Reg::R0,
8219                rnhi: Reg::R1,
8220                rmlo: Reg::R2,
8221                rmhi: Reg::R3,
8222            },
8223        ];
8224
8225        for op in &ops {
8226            let code = encoder.encode(op).unwrap();
8227            assert!(
8228                code.len() >= 8,
8229                "i64 comparison {:?} should emit at least 8 bytes, got {}",
8230                op,
8231                code.len()
8232            );
8233        }
8234    }
8235
8236    #[test]
8237    fn test_encode_i64_const_zero_thumb2() {
8238        let encoder = ArmEncoder::new_thumb2();
8239        let op = ArmOp::I64Const {
8240            rdlo: Reg::R0,
8241            rdhi: Reg::R1,
8242            value: 0,
8243        };
8244        let code = encoder.encode(&op).unwrap();
8245        // MOVW R0, #0 (4 bytes) + MOVW R1, #0 (4 bytes) = 8 bytes
8246        assert_eq!(code.len(), 8, "I64Const(0) should be 8 bytes");
8247    }
8248
8249    #[test]
8250    fn test_encode_i64_const_negative_one_thumb2() {
8251        let encoder = ArmEncoder::new_thumb2();
8252        let op = ArmOp::I64Const {
8253            rdlo: Reg::R0,
8254            rdhi: Reg::R1,
8255            value: -1, // 0xFFFF_FFFF_FFFF_FFFF
8256        };
8257        let code = encoder.encode(&op).unwrap();
8258        // MOVW + MOVT for lo (8 bytes) + MOVW + MOVT for hi (8 bytes) = 16 bytes
8259        assert_eq!(code.len(), 16, "I64Const(-1) should be 16 bytes");
8260    }
8261
8262    // =========================================================================
8263    // Sub-word load/store encoding tests
8264    // =========================================================================
8265
8266    #[test]
8267    fn test_encode_ldrb_arm32() {
8268        let encoder = ArmEncoder::new_arm32();
8269        let op = ArmOp::Ldrb {
8270            rd: Reg::R0,
8271            addr: MemAddr::imm(Reg::R1, 4),
8272        };
8273        let code = encoder.encode(&op).unwrap();
8274        assert_eq!(code.len(), 4, "ARM32 LDRB should be 4 bytes");
8275        // LDRB R0, [R1, #4] = 0xE5D10004
8276        let encoded = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8277        assert_eq!(encoded, 0xE5D10004, "Should encode LDRB R0, [R1, #4]");
8278    }
8279
8280    #[test]
8281    fn test_encode_strb_arm32() {
8282        let encoder = ArmEncoder::new_arm32();
8283        let op = ArmOp::Strb {
8284            rd: Reg::R0,
8285            addr: MemAddr::imm(Reg::R1, 0),
8286        };
8287        let code = encoder.encode(&op).unwrap();
8288        assert_eq!(code.len(), 4, "ARM32 STRB should be 4 bytes");
8289        // STRB R0, [R1, #0] = 0xE5C10000
8290        let encoded = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8291        assert_eq!(encoded, 0xE5C10000, "Should encode STRB R0, [R1, #0]");
8292    }
8293
8294    #[test]
8295    fn test_encode_ldrh_arm32() {
8296        let encoder = ArmEncoder::new_arm32();
8297        let op = ArmOp::Ldrh {
8298            rd: Reg::R0,
8299            addr: MemAddr::imm(Reg::R1, 2),
8300        };
8301        let code = encoder.encode(&op).unwrap();
8302        assert_eq!(code.len(), 4, "ARM32 LDRH should be 4 bytes");
8303    }
8304
8305    #[test]
8306    fn test_encode_strh_arm32() {
8307        let encoder = ArmEncoder::new_arm32();
8308        let op = ArmOp::Strh {
8309            rd: Reg::R0,
8310            addr: MemAddr::imm(Reg::R1, 0),
8311        };
8312        let code = encoder.encode(&op).unwrap();
8313        assert_eq!(code.len(), 4, "ARM32 STRH should be 4 bytes");
8314    }
8315
8316    #[test]
8317    fn test_encode_ldrsb_arm32() {
8318        let encoder = ArmEncoder::new_arm32();
8319        let op = ArmOp::Ldrsb {
8320            rd: Reg::R0,
8321            addr: MemAddr::imm(Reg::R1, 0),
8322        };
8323        let code = encoder.encode(&op).unwrap();
8324        assert_eq!(code.len(), 4, "ARM32 LDRSB should be 4 bytes");
8325    }
8326
8327    #[test]
8328    fn test_encode_ldrsh_arm32() {
8329        let encoder = ArmEncoder::new_arm32();
8330        let op = ArmOp::Ldrsh {
8331            rd: Reg::R0,
8332            addr: MemAddr::imm(Reg::R1, 0),
8333        };
8334        let code = encoder.encode(&op).unwrap();
8335        assert_eq!(code.len(), 4, "ARM32 LDRSH should be 4 bytes");
8336    }
8337
8338    #[test]
8339    fn test_encode_ldrb_thumb2_16bit() {
8340        let encoder = ArmEncoder::new_thumb2();
8341        let op = ArmOp::Ldrb {
8342            rd: Reg::R0,
8343            addr: MemAddr::imm(Reg::R1, 4),
8344        };
8345        let code = encoder.encode(&op).unwrap();
8346        // Low registers + small offset -> 16-bit encoding
8347        assert_eq!(
8348            code.len(),
8349            2,
8350            "Thumb-2 LDRB with small offset should be 16-bit"
8351        );
8352    }
8353
8354    #[test]
8355    fn test_encode_ldrb_thumb2_32bit() {
8356        let encoder = ArmEncoder::new_thumb2();
8357        let op = ArmOp::Ldrb {
8358            rd: Reg::R0,
8359            addr: MemAddr::imm(Reg::R1, 100), // offset > 31 needs 32-bit
8360        };
8361        let code = encoder.encode(&op).unwrap();
8362        assert_eq!(
8363            code.len(),
8364            4,
8365            "Thumb-2 LDRB with large offset should be 32-bit"
8366        );
8367    }
8368
8369    #[test]
8370    fn test_encode_strb_thumb2_16bit() {
8371        let encoder = ArmEncoder::new_thumb2();
8372        let op = ArmOp::Strb {
8373            rd: Reg::R0,
8374            addr: MemAddr::imm(Reg::R1, 10),
8375        };
8376        let code = encoder.encode(&op).unwrap();
8377        assert_eq!(
8378            code.len(),
8379            2,
8380            "Thumb-2 STRB with small offset should be 16-bit"
8381        );
8382    }
8383
8384    #[test]
8385    fn test_encode_ldrh_thumb2_16bit() {
8386        let encoder = ArmEncoder::new_thumb2();
8387        let op = ArmOp::Ldrh {
8388            rd: Reg::R0,
8389            addr: MemAddr::imm(Reg::R1, 4), // offset aligned to 2, <= 62
8390        };
8391        let code = encoder.encode(&op).unwrap();
8392        assert_eq!(
8393            code.len(),
8394            2,
8395            "Thumb-2 LDRH with small aligned offset should be 16-bit"
8396        );
8397    }
8398
8399    #[test]
8400    fn test_encode_strh_thumb2_16bit() {
8401        let encoder = ArmEncoder::new_thumb2();
8402        let op = ArmOp::Strh {
8403            rd: Reg::R0,
8404            addr: MemAddr::imm(Reg::R1, 4),
8405        };
8406        let code = encoder.encode(&op).unwrap();
8407        assert_eq!(
8408            code.len(),
8409            2,
8410            "Thumb-2 STRH with small aligned offset should be 16-bit"
8411        );
8412    }
8413
8414    #[test]
8415    fn test_encode_ldrsb_thumb2() {
8416        let encoder = ArmEncoder::new_thumb2();
8417        let op = ArmOp::Ldrsb {
8418            rd: Reg::R0,
8419            addr: MemAddr::imm(Reg::R1, 0),
8420        };
8421        let code = encoder.encode(&op).unwrap();
8422        // LDRSB has no 16-bit immediate form, always 32-bit
8423        assert_eq!(code.len(), 4, "Thumb-2 LDRSB should be 32-bit");
8424    }
8425
8426    #[test]
8427    fn test_encode_ldrsh_thumb2() {
8428        let encoder = ArmEncoder::new_thumb2();
8429        let op = ArmOp::Ldrsh {
8430            rd: Reg::R0,
8431            addr: MemAddr::imm(Reg::R1, 0),
8432        };
8433        let code = encoder.encode(&op).unwrap();
8434        assert_eq!(code.len(), 4, "Thumb-2 LDRSH should be 32-bit");
8435    }
8436
/// Encoding `MemorySize` into R0 in Thumb-2 mode must emit at least one byte.
#[test]
fn test_encode_memory_size_thumb2() {
    let mem_size = ArmOp::MemorySize { rd: Reg::R0 };
    let bytes = ArmEncoder::new_thumb2().encode(&mem_size).unwrap();

    // The operation involves R0 and R10, which are not both low registers, so the
    // exact instruction width is not pinned here; only non-emptiness is checked.
    assert!(!bytes.is_empty(), "MemorySize should produce code");
}
8445
/// Encoding `MemoryGrow` (R0 as both destination and operand) in Thumb-2 mode
/// must yield a single 4-byte instruction.
#[test]
fn test_encode_memory_grow_thumb2() {
    let grow = ArmOp::MemoryGrow {
        rd: Reg::R0,
        rn: Reg::R0,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&grow).unwrap();
    assert_eq!(bytes.len(), 4, "MemoryGrow (MVN) should be 32-bit Thumb-2");
}
8456
/// Sub-word loads and stores addressed as `[R1, R2]` must each encode to
/// 4 bytes in Thumb-2 mode.
#[test]
fn test_encode_subword_reg_offset_thumb2() {
    let enc = ArmEncoder::new_thumb2();
    // Every case uses the same base + register-offset address.
    let reg_addr = || MemAddr::reg(Reg::R1, Reg::R2);

    // (operation, failure message), checked in the order LDRB, STRB, LDRH, STRH.
    let cases = [
        (
            ArmOp::Ldrb {
                rd: Reg::R0,
                addr: reg_addr(),
            },
            "Thumb-2 LDRB with reg offset should be 32-bit",
        ),
        (
            ArmOp::Strb {
                rd: Reg::R0,
                addr: reg_addr(),
            },
            "Thumb-2 STRB with reg offset should be 32-bit",
        ),
        (
            ArmOp::Ldrh {
                rd: Reg::R0,
                addr: reg_addr(),
            },
            "Thumb-2 LDRH with reg offset should be 32-bit",
        ),
        (
            ArmOp::Strh {
                rd: Reg::R0,
                addr: reg_addr(),
            },
            "Thumb-2 STRH with reg offset should be 32-bit",
        ),
    ];

    for (op, why) in cases.iter() {
        let bytes = enc.encode(op).unwrap();
        assert_eq!(bytes.len(), 4, "{}", why);
    }
}
8509
/// `LDRB` with both a register offset and an immediate offset must expand to
/// 8 bytes in Thumb-2 mode.
#[test]
fn test_encode_subword_reg_imm_offset_thumb2() {
    let addr = MemAddr::reg_imm(Reg::R1, Reg::R2, 4);
    let ldrb = ArmOp::Ldrb { rd: Reg::R0, addr };

    let bytes = ArmEncoder::new_thumb2().encode(&ldrb).unwrap();

    // Expected expansion: ADD R12, R2, #4 (4 bytes) followed by
    // LDRB R0, [R1, R12] (4 bytes).
    assert_eq!(
        bytes.len(),
        8,
        "Thumb-2 LDRB with reg+imm offset should be 8 bytes"
    );
}
8527
8528    // ========================================================================
8529    // Helium MVE encoding tests
8530    // ========================================================================
8531
/// MVE integer add on 32-bit lanes (Q0 = Q1 + Q2) must encode to 4 bytes.
#[test]
fn test_encode_mve_addi32_thumb2() {
    let vadd = ArmOp::MveAddI {
        qd: QReg::Q0,
        qn: QReg::Q1,
        qm: QReg::Q2,
        size: MveSize::S32,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&vadd).unwrap();

    assert_eq!(
        bytes.len(),
        4,
        "MVE VADD.I32 should be 4 bytes (Thumb-2 32-bit)"
    );
}
8548
/// MVE integer subtract on 16-bit lanes must encode to 4 bytes.
#[test]
fn test_encode_mve_subi16_thumb2() {
    let vsub = ArmOp::MveSubI {
        qd: QReg::Q0,
        qn: QReg::Q1,
        qm: QReg::Q2,
        size: MveSize::S16,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&vsub).unwrap();
    assert_eq!(bytes.len(), 4, "MVE VSUB.I16 should be 4 bytes");
}
8561
/// MVE integer multiply on 8-bit lanes must encode to 4 bytes.
#[test]
fn test_encode_mve_muli8_thumb2() {
    let vmul = ArmOp::MveMulI {
        qd: QReg::Q0,
        qn: QReg::Q1,
        qm: QReg::Q2,
        size: MveSize::S8,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&vmul).unwrap();
    assert_eq!(bytes.len(), 4, "MVE VMUL.I8 should be 4 bytes");
}
8574
/// Each MVE bitwise operation (AND, ORR, EOR, BIC) on Q0, Q1, Q2 must encode
/// to 4 bytes in Thumb-2 mode.
#[test]
fn test_encode_mve_bitwise_thumb2() {
    let enc = ArmEncoder::new_thumb2();
    let (qd, qn, qm) = (QReg::Q0, QReg::Q1, QReg::Q2);

    // Same register triple for every operation; checked in this order.
    let bitwise_ops = [
        ArmOp::MveAnd {
            qd: QReg::Q0,
            qn: QReg::Q1,
            qm: QReg::Q2,
        },
        ArmOp::MveOrr {
            qd: QReg::Q0,
            qn: QReg::Q1,
            qm: QReg::Q2,
        },
        ArmOp::MveEor {
            qd: QReg::Q0,
            qn: QReg::Q1,
            qm: QReg::Q2,
        },
        ArmOp::MveBic { qd, qn, qm },
    ];

    for op in bitwise_ops.iter() {
        let bytes = enc.encode(op).unwrap();
        assert_eq!(bytes.len(), 4, "MVE bitwise op should be 4 bytes");
    }
}
8606
/// MVE bitwise NOT (Q0 = !Q1) must encode to 4 bytes.
#[test]
fn test_encode_mve_mvn_thumb2() {
    let vmvn = ArmOp::MveMvn {
        qd: QReg::Q0,
        qm: QReg::Q1,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&vmvn).unwrap();
    assert_eq!(bytes.len(), 4, "MVE VMVN should be 4 bytes");
}
8617
/// MVE vector load and store with immediate-offset addressing must each
/// encode to 4 bytes.
#[test]
fn test_encode_mve_load_store_thumb2() {
    let enc = ArmEncoder::new_thumb2();

    // Load into Q0 from [R0, #16].
    let load_bytes = enc
        .encode(&ArmOp::MveLoad {
            qd: QReg::Q0,
            addr: MemAddr::imm(Reg::R0, 16),
        })
        .unwrap();
    assert_eq!(load_bytes.len(), 4, "MVE VLDRW.32 should be 4 bytes");

    // Store Q1 to [R1, #0].
    let store_bytes = enc
        .encode(&ArmOp::MveStore {
            qd: QReg::Q1,
            addr: MemAddr::imm(Reg::R1, 0),
        })
        .unwrap();
    assert_eq!(store_bytes.len(), 4, "MVE VSTRW.32 should be 4 bytes");
}
8636
/// Materialising a 128-bit constant into Q0 must expand to a multi-instruction
/// sequence of at least 24 bytes.
#[test]
fn test_encode_mve_const_thumb2() {
    // Little-endian 32-bit words 1, 2, 3, 4.
    let bytes = [1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0];
    let vconst = ArmOp::MveConst {
        qd: QReg::Q0,
        bytes,
    };

    let emitted = ArmEncoder::new_thumb2().encode(&vconst).unwrap();

    // Intended lowering: per word, MOVW R12 (+ MOVT when the upper 16 bits are
    // non-zero) followed by VMOV Sn, i.e. nominally 4 * (4 + 4) = 32 bytes.
    // NOTE(review): the bound checked below (24) is looser than that figure;
    // confirm the encoder's real minimum before tightening it.
    assert!(
        emitted.len() >= 24,
        "MVE const should produce multiple instructions"
    );
}
8652
/// Broadcasting R0 into every 32-bit lane of Q0 must encode to 4 bytes.
#[test]
fn test_encode_mve_dup_thumb2() {
    let vdup = ArmOp::MveDup {
        qd: QReg::Q0,
        rn: Reg::R0,
        size: MveSize::S32,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&vdup).unwrap();
    assert_eq!(bytes.len(), 4, "MVE VDUP.32 should be 4 bytes");
}
8664
/// Extracting 32-bit lane 2 of Q1 into R0 must encode to 4 bytes.
#[test]
fn test_encode_mve_extract_lane_thumb2() {
    let extract = ArmOp::MveExtractLane {
        rd: Reg::R0,
        qn: QReg::Q1,
        lane: 2,
        size: MveSize::S32,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&extract).unwrap();
    assert_eq!(bytes.len(), 4, "MVE extract lane should be 4 bytes");
}
8677
/// Inserting R1 into 32-bit lane 3 of Q0 must encode to 4 bytes.
#[test]
fn test_encode_mve_insert_lane_thumb2() {
    let insert = ArmOp::MveInsertLane {
        qd: QReg::Q0,
        rn: Reg::R1,
        lane: 3,
        size: MveSize::S32,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&insert).unwrap();
    assert_eq!(bytes.len(), 4, "MVE insert lane should be 4 bytes");
}
8690
/// MVE single-precision vector add (Q0 = Q1 + Q2) must encode to 4 bytes.
#[test]
fn test_encode_mve_addf32_thumb2() {
    let vaddf = ArmOp::MveAddF32 {
        qd: QReg::Q0,
        qn: QReg::Q1,
        qm: QReg::Q2,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&vaddf).unwrap();
    assert_eq!(bytes.len(), 4, "MVE VADD.F32 should be 4 bytes");
}
8702
/// MVE single-precision vector divide must expand to 16 bytes.
#[test]
fn test_encode_mve_divf32_thumb2() {
    let vdivf = ArmOp::MveDivF32 {
        qd: QReg::Q0,
        qn: QReg::Q1,
        qm: QReg::Q2,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&vdivf).unwrap();

    // Expected lowering is one VDIV.F32 per lane: 4 lanes x 4 bytes each.
    assert_eq!(
        bytes.len(),
        16,
        "MVE VDIV.F32 (lane-wise) should be 16 bytes"
    );
}
8719
/// MVE single-precision vector square root must expand to 16 bytes.
#[test]
fn test_encode_mve_sqrtf32_thumb2() {
    let vsqrtf = ArmOp::MveSqrtF32 {
        qd: QReg::Q0,
        qm: QReg::Q1,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&vsqrtf).unwrap();

    // Expected lowering is one VSQRT.F32 per lane: 4 lanes x 4 bytes each.
    assert_eq!(
        bytes.len(),
        16,
        "MVE VSQRT.F32 (lane-wise) should be 16 bytes"
    );
}
8735
/// MVE single-precision vector negate (Q0 = -Q1) must encode to 4 bytes.
#[test]
fn test_encode_mve_negf32_thumb2() {
    let vnegf = ArmOp::MveNegF32 {
        qd: QReg::Q0,
        qm: QReg::Q1,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&vnegf).unwrap();
    assert_eq!(bytes.len(), 4, "MVE VNEG.F32 should be 4 bytes");
}
8746
/// MVE single-precision vector absolute value (Q0 = |Q1|) must encode to 4 bytes.
#[test]
fn test_encode_mve_absf32_thumb2() {
    let vabsf = ArmOp::MveAbsF32 {
        qd: QReg::Q0,
        qm: QReg::Q1,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&vabsf).unwrap();
    assert_eq!(bytes.len(), 4, "MVE VABS.F32 should be 4 bytes");
}
8757
/// Two 32-bit MVE integer adds that differ only in their Q-register operands
/// must not produce the same bytes.
#[test]
fn test_encode_mve_different_qregs() {
    let enc = ArmEncoder::new_thumb2();

    // Builds and encodes a 32-bit-lane vector add for the given register triple.
    let encode_vadd = |qd: QReg, qn: QReg, qm: QReg| {
        let op = ArmOp::MveAddI {
            qd,
            qn,
            qm,
            size: MveSize::S32,
        };
        enc.encode(&op).unwrap()
    };

    let all_q0 = encode_vadd(QReg::Q0, QReg::Q0, QReg::Q0);
    let mixed = encode_vadd(QReg::Q3, QReg::Q5, QReg::Q7);

    assert_ne!(
        all_q0, mixed,
        "Different Q-registers should produce different encodings"
    );
}
8782
/// An MVE operation passed to the ARM32 encoder must come out as the 4-byte
/// NOP word `0xE1A00000`, since MVE is only supported in Thumb-2 mode.
#[test]
fn test_encode_mve_arm32_nop() {
    let vadd = ArmOp::MveAddI {
        qd: QReg::Q0,
        qn: QReg::Q1,
        qm: QReg::Q2,
        size: MveSize::S32,
    };

    let bytes = ArmEncoder::new_arm32().encode(&vadd).unwrap();
    assert_eq!(bytes.len(), 4, "ARM32 MVE should be 4 bytes (NOP)");

    // Reassemble the little-endian instruction word; the length was checked
    // above, so the 4-byte slice is in bounds.
    let mut raw = [0u8; 4];
    raw.copy_from_slice(&bytes[..4]);
    let word = u32::from_le_bytes(raw);

    // 0xE1A00000 is MOV R0, R0, the ARM32 NOP.
    assert_eq!(word, 0xE1A00000, "ARM32 MVE should encode as NOP");
}
8799}