Skip to main content

synth_backend/
arm_encoder.rs

1//! ARM Code Encoder - Converts ARM instructions to binary machine code
2//!
3//! Generates ARM32/Thumb-2 machine code from ARM instruction structures
4
5use synth_core::Result;
6use synth_core::target::FPUPrecision;
7use synth_synthesis::contracts::encoding as encoding_contracts;
8use synth_synthesis::{ArmOp, MemAddr, MveSize, Operand2, QReg, Reg, VfpReg};
9
10/// ARM instruction encoding
11pub struct ArmEncoder {
12    /// Use Thumb mode (vs ARM mode)
13    thumb_mode: bool,
14    /// FPU capability for VFP instruction encoding
15    #[allow(dead_code)]
16    fpu: Option<FPUPrecision>,
17}
18
19impl ArmEncoder {
20    /// Create a new ARM encoder in ARM32 mode
21    pub fn new_arm32() -> Self {
22        Self {
23            thumb_mode: false,
24            fpu: None,
25        }
26    }
27
28    /// Create a new ARM encoder in Thumb-2 mode
29    pub fn new_thumb2() -> Self {
30        Self {
31            thumb_mode: true,
32            fpu: None,
33        }
34    }
35
36    /// Create a new Thumb-2 encoder with FPU capability
37    pub fn new_thumb2_with_fpu(fpu: Option<FPUPrecision>) -> Self {
38        Self {
39            thumb_mode: true,
40            fpu,
41        }
42    }
43
44    /// Encode a single ARM instruction to bytes
45    pub fn encode(&self, op: &ArmOp) -> Result<Vec<u8>> {
46        if self.thumb_mode {
47            self.encode_thumb(op)
48        } else {
49            self.encode_arm(op)
50        }
51    }
52
53    /// Encode an ARM instruction in ARM32 mode (32-bit instructions)
54    /// #206: encode an ARM32 (A32) load/store whose address uses a register
55    /// offset (`[rn, rm{, #off}]`). Returns `None` for ops with no register
56    /// offset (the caller falls through to the immediate-form arms). Computes
57    /// `ip = base + rm` then re-encodes the op against `[ip, #off]`, which works
58    /// uniformly for word/byte/halfword/signed forms. IP (R12) is the scratch
59    /// register the selector already treats as clobberable across memory ops.
60    fn encode_arm_reg_offset_mem(&self, op: &ArmOp) -> Result<Option<Vec<u8>>> {
61        use synth_synthesis::Reg;
62        let addr = match op {
63            ArmOp::Ldr { addr, .. }
64            | ArmOp::Str { addr, .. }
65            | ArmOp::Ldrb { addr, .. }
66            | ArmOp::Strb { addr, .. }
67            | ArmOp::Ldrh { addr, .. }
68            | ArmOp::Strh { addr, .. }
69            | ArmOp::Ldrsb { addr, .. }
70            | ArmOp::Ldrsh { addr, .. } => addr,
71            _ => return Ok(None),
72        };
73        let Some(rm) = addr.offset_reg else {
74            return Ok(None);
75        };
76        let ip = Reg::R12;
77        // ADD ip, base, rm  (cond=AL, opcode=ADD, S=0, register operand2)
78        let add: u32 = 0xE0800000
79            | (reg_to_bits(&addr.base) << 16)
80            | (reg_to_bits(&ip) << 12)
81            | reg_to_bits(&rm);
82        let mut bytes = add.to_le_bytes().to_vec();
83        // Re-encode the op against [ip, #off] (immediate form → no offset_reg,
84        // so this recursion hits the immediate arms, not this helper again).
85        let imm_addr = MemAddr::imm(ip, addr.offset);
86        let imm_op = match op {
87            ArmOp::Ldr { rd, .. } => ArmOp::Ldr {
88                rd: *rd,
89                addr: imm_addr,
90            },
91            ArmOp::Str { rd, .. } => ArmOp::Str {
92                rd: *rd,
93                addr: imm_addr,
94            },
95            ArmOp::Ldrb { rd, .. } => ArmOp::Ldrb {
96                rd: *rd,
97                addr: imm_addr,
98            },
99            ArmOp::Strb { rd, .. } => ArmOp::Strb {
100                rd: *rd,
101                addr: imm_addr,
102            },
103            ArmOp::Ldrh { rd, .. } => ArmOp::Ldrh {
104                rd: *rd,
105                addr: imm_addr,
106            },
107            ArmOp::Strh { rd, .. } => ArmOp::Strh {
108                rd: *rd,
109                addr: imm_addr,
110            },
111            ArmOp::Ldrsb { rd, .. } => ArmOp::Ldrsb {
112                rd: *rd,
113                addr: imm_addr,
114            },
115            ArmOp::Ldrsh { rd, .. } => ArmOp::Ldrsh {
116                rd: *rd,
117                addr: imm_addr,
118            },
119            _ => unreachable!(),
120        };
121        bytes.extend(self.encode_arm(&imm_op)?);
122        Ok(Some(bytes))
123    }
124
125    fn encode_arm(&self, op: &ArmOp) -> Result<Vec<u8>> {
126        // #206: ARM32 register-offset loads/stores. `encode_mem_addr` only
127        // returns the 12-bit immediate, so the immediate-form arms below
128        // silently DROP `addr.offset_reg` — a runtime address index vanished,
129        // turning `ldr rd,[rn,rm,#off]` into `ldr rd,[rn,#off]` (the access went
130        // to the wrong address). Compute the effective base into IP and re-encode
131        // against `[ip, #off]`, which is uniform for word/byte/halfword/signed.
132        if let Some(bytes) = self.encode_arm_reg_offset_mem(op)? {
133            return Ok(bytes);
134        }
135        let instr: u32 = match op {
136            // Data processing instructions
137            ArmOp::Add { rd, rn, op2 } => {
138                let rd_bits = reg_to_bits(rd);
139                let rn_bits = reg_to_bits(rn);
140                let (op2_bits, i_flag) = encode_operand2(op2);
141
142                // ADD encoding: cond(4) | 00 | I(1) | 0100 | S(1) | Rn(4) | Rd(4) | operand2(12)
143                0xE0800000 // condition=always(E), opcode=ADD(0100), S=0
144                    | (i_flag << 25)
145                    | (rn_bits << 16)
146                    | (rd_bits << 12)
147                    | op2_bits
148            }
149
150            ArmOp::Sub { rd, rn, op2 } => {
151                let rd_bits = reg_to_bits(rd);
152                let rn_bits = reg_to_bits(rn);
153                let (op2_bits, i_flag) = encode_operand2(op2);
154
155                // SUB encoding: opcode=0010
156                0xE0400000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
157            }
158
159            // i64 support: ADDS, ADC, SUBS, SBC for ARM32
160            ArmOp::Adds { rd, rn, op2 } => {
161                let rd_bits = reg_to_bits(rd);
162                let rn_bits = reg_to_bits(rn);
163                let (op2_bits, i_flag) = encode_operand2(op2);
164
165                // ADDS encoding: opcode=0100, S=1
166                0xE0900000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
167            }
168
169            ArmOp::Adc { rd, rn, op2 } => {
170                let rd_bits = reg_to_bits(rd);
171                let rn_bits = reg_to_bits(rn);
172                let (op2_bits, i_flag) = encode_operand2(op2);
173
174                // ADC encoding: opcode=0101
175                0xE0A00000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
176            }
177
178            ArmOp::Subs { rd, rn, op2 } => {
179                let rd_bits = reg_to_bits(rd);
180                let rn_bits = reg_to_bits(rn);
181                let (op2_bits, i_flag) = encode_operand2(op2);
182
183                // SUBS encoding: opcode=0010, S=1
184                0xE0500000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
185            }
186
187            ArmOp::Sbc { rd, rn, op2 } => {
188                let rd_bits = reg_to_bits(rd);
189                let rn_bits = reg_to_bits(rn);
190                let (op2_bits, i_flag) = encode_operand2(op2);
191
192                // SBC encoding: opcode=0110
193                0xE0C00000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
194            }
195
196            ArmOp::Mul { rd, rn, rm } => {
197                let rd_bits = reg_to_bits(rd);
198                let rn_bits = reg_to_bits(rn);
199                let rm_bits = reg_to_bits(rm);
200
201                // MUL encoding: cond(4) | 000000 | A(1) | S(1) | Rd(4) | Rn(4) | Rs(4) | 1001 | Rm(4)
202                0xE0000090 | (rd_bits << 16) | (rn_bits << 8) | rm_bits
203            }
204
205            ArmOp::Sdiv { rd, rn, rm } => {
206                let rd_bits = reg_to_bits(rd);
207                let rn_bits = reg_to_bits(rn);
208                let rm_bits = reg_to_bits(rm);
209
210                // SDIV encoding: cond(4) | 01110001 | Rd(4) | 1111 | Rm(4) | 0001 | Rn(4)
211                // ARMv7-M and above
212                0xE710F010 | (rd_bits << 16) | (rm_bits << 8) | rn_bits
213            }
214
215            ArmOp::Udiv { rd, rn, rm } => {
216                let rd_bits = reg_to_bits(rd);
217                let rn_bits = reg_to_bits(rn);
218                let rm_bits = reg_to_bits(rm);
219
220                // UDIV encoding: cond(4) | 01110011 | Rd(4) | 1111 | Rm(4) | 0001 | Rn(4)
221                // ARMv7-M and above
222                0xE730F010 | (rd_bits << 16) | (rm_bits << 8) | rn_bits
223            }
224
225            ArmOp::Mls { rd, rn, rm, ra } => {
226                let rd_bits = reg_to_bits(rd);
227                let rn_bits = reg_to_bits(rn);
228                let rm_bits = reg_to_bits(rm);
229                let ra_bits = reg_to_bits(ra);
230
231                // MLS encoding: cond(4) | 00000110 | Rd(4) | Ra(4) | Rm(4) | 1001 | Rn(4)
232                // Rd = Ra - (Rn * Rm)
233                0xE0600090 | (rd_bits << 16) | (ra_bits << 12) | (rm_bits << 8) | rn_bits
234            }
235
236            ArmOp::And { rd, rn, op2 } => {
237                let rd_bits = reg_to_bits(rd);
238                let rn_bits = reg_to_bits(rn);
239                let (op2_bits, i_flag) = encode_operand2(op2);
240
241                // AND encoding: opcode=0000
242                0xE0000000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
243            }
244
245            ArmOp::Orr { rd, rn, op2 } => {
246                let rd_bits = reg_to_bits(rd);
247                let rn_bits = reg_to_bits(rn);
248                let (op2_bits, i_flag) = encode_operand2(op2);
249
250                // ORR encoding: opcode=1100
251                0xE1800000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
252            }
253
254            ArmOp::Eor { rd, rn, op2 } => {
255                let rd_bits = reg_to_bits(rd);
256                let rn_bits = reg_to_bits(rn);
257                let (op2_bits, i_flag) = encode_operand2(op2);
258
259                // EOR encoding: opcode=0001
260                0xE0200000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
261            }
262
263            // Shift instructions
264            ArmOp::Lsl { rd, rn, shift } => {
265                let rd_bits = reg_to_bits(rd);
266                let rn_bits = reg_to_bits(rn);
267                let shift_bits = *shift & 0x1F;
268
269                // LSL encoding: MOV with shift
270                0xE1A00000 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
271            }
272
273            ArmOp::Lsr { rd, rn, shift } => {
274                let rd_bits = reg_to_bits(rd);
275                let rn_bits = reg_to_bits(rn);
276                let shift_bits = *shift & 0x1F;
277
278                // LSR encoding
279                0xE1A00020 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
280            }
281
282            ArmOp::Asr { rd, rn, shift } => {
283                let rd_bits = reg_to_bits(rd);
284                let rn_bits = reg_to_bits(rn);
285                let shift_bits = *shift & 0x1F;
286
287                // ASR encoding
288                0xE1A00040 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
289            }
290
291            ArmOp::Ror { rd, rn, shift } => {
292                let rd_bits = reg_to_bits(rd);
293                let rn_bits = reg_to_bits(rn);
294                let shift_bits = *shift & 0x1F;
295
296                // ROR encoding: MOV with ROR shift
297                0xE1A00060 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
298            }
299
300            // Register-based shifts (ARM32)
301            // LSL Rd, Rn, Rm: cond 0001101S 0000 Rd Rs 0001 Rn
302            ArmOp::LslReg { rd, rn, rm } => {
303                let rd_bits = reg_to_bits(rd);
304                let rn_bits = reg_to_bits(rn);
305                let rm_bits = reg_to_bits(rm);
306                0xE1A00010 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
307            }
308            ArmOp::LsrReg { rd, rn, rm } => {
309                let rd_bits = reg_to_bits(rd);
310                let rn_bits = reg_to_bits(rn);
311                let rm_bits = reg_to_bits(rm);
312                0xE1A00030 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
313            }
314            ArmOp::AsrReg { rd, rn, rm } => {
315                let rd_bits = reg_to_bits(rd);
316                let rn_bits = reg_to_bits(rn);
317                let rm_bits = reg_to_bits(rm);
318                0xE1A00050 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
319            }
320            ArmOp::RorReg { rd, rn, rm } => {
321                let rd_bits = reg_to_bits(rd);
322                let rn_bits = reg_to_bits(rn);
323                let rm_bits = reg_to_bits(rm);
324                0xE1A00070 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
325            }
326
327            // RSB (Reverse Subtract): Rd = imm - Rn
328            ArmOp::Rsb { rd, rn, imm } => {
329                let rd_bits = reg_to_bits(rd);
330                let rn_bits = reg_to_bits(rn);
331                // RSB encoding: cond(4) | 00 1 0011 S | Rn(4) | Rd(4) | imm12
332                // Opcode for RSB = 0011, I=1 (immediate), S=0
333                0xE2600000 | (rn_bits << 16) | (rd_bits << 12) | (*imm & 0xFF)
334            }
335
336            // Bit manipulation instructions
337            ArmOp::Clz { rd, rm } => {
338                let rd_bits = reg_to_bits(rd);
339                let rm_bits = reg_to_bits(rm);
340
341                // CLZ encoding: cond(4) | 00010110 | 1111 | Rd(4) | 1111 | 0001 | Rm(4)
342                // ARMv5T and above
343                0xE16F0F10 | (rd_bits << 12) | rm_bits
344            }
345
346            ArmOp::Rbit { rd, rm } => {
347                let rd_bits = reg_to_bits(rd);
348                let rm_bits = reg_to_bits(rm);
349
350                // RBIT encoding: cond(4) | 01101111 | 1111 | Rd(4) | 1111 | 0011 | Rm(4)
351                // ARMv6T2 and above
352                0xE6FF0F30 | (rd_bits << 12) | rm_bits
353            }
354
355            ArmOp::Sxtb { rd, rm } => {
356                let rd_bits = reg_to_bits(rd);
357                let rm_bits = reg_to_bits(rm);
358
359                // SXTB encoding: cond(4) | 01101010 | 1111 | Rd(4) | rotate(2) | 00 | 0111 | Rm(4)
360                // ARMv6 and above. rotate=00 for no rotation
361                0xE6AF0070 | (rd_bits << 12) | rm_bits
362            }
363
364            ArmOp::Sxth { rd, rm } => {
365                let rd_bits = reg_to_bits(rd);
366                let rm_bits = reg_to_bits(rm);
367
368                // SXTH encoding: cond(4) | 01101011 | 1111 | Rd(4) | rotate(2) | 00 | 0111 | Rm(4)
369                // ARMv6 and above. rotate=00 for no rotation
370                0xE6BF0070 | (rd_bits << 12) | rm_bits
371            }
372
373            // Move instructions
374            ArmOp::Mov { rd, op2 } => {
375                let rd_bits = reg_to_bits(rd);
376                let (op2_bits, i_flag) = encode_operand2(op2);
377
378                // MOV encoding: opcode=1101
379                0xE1A00000 | (i_flag << 25) | (rd_bits << 12) | op2_bits
380            }
381
382            ArmOp::Mvn { rd, op2 } => {
383                let rd_bits = reg_to_bits(rd);
384                let (op2_bits, i_flag) = encode_operand2(op2);
385
386                // MVN encoding: opcode=1111
387                0xE1E00000 | (i_flag << 25) | (rd_bits << 12) | op2_bits
388            }
389
390            // MOVW - Move Wide (ARM32)
391            // Encoding: cond(4) | 0011 0000 | imm4(4) | Rd(4) | imm12(12)
392            ArmOp::Movw { rd, imm16 } => {
393                let rd_bits = reg_to_bits(rd);
394                let imm4 = ((*imm16 as u32) >> 12) & 0xF;
395                let imm12 = (*imm16 as u32) & 0xFFF;
396                0xE3000000 | (imm4 << 16) | (rd_bits << 12) | imm12
397            }
398
399            // MOVT - Move Top (ARM32)
400            // Encoding: cond(4) | 0011 0100 | imm4(4) | Rd(4) | imm12(12)
401            ArmOp::Movt { rd, imm16 } => {
402                let rd_bits = reg_to_bits(rd);
403                let imm4 = ((*imm16 as u32) >> 12) & 0xF;
404                let imm12 = (*imm16 as u32) & 0xFFF;
405                0xE3400000 | (imm4 << 16) | (rd_bits << 12) | imm12
406            }
407
408            // Compare
409            ArmOp::Cmp { rn, op2 } => {
410                let rn_bits = reg_to_bits(rn);
411                let (op2_bits, i_flag) = encode_operand2(op2);
412
413                // CMP encoding: opcode=1010, S=1
414                0xE1500000 | (i_flag << 25) | (rn_bits << 16) | op2_bits
415            }
416
417            // Compare Negative (CMN) - computes Rn + op2 and sets flags
418            ArmOp::Cmn { rn, op2 } => {
419                let rn_bits = reg_to_bits(rn);
420                let (op2_bits, i_flag) = encode_operand2(op2);
421
422                // CMN encoding: opcode=1011, S=1
423                0xE1700000 | (i_flag << 25) | (rn_bits << 16) | op2_bits
424            }
425
426            // Load/Store
427            ArmOp::Ldr { rd, addr } => {
428                let rd_bits = reg_to_bits(rd);
429                let (base_bits, offset_bits) = encode_mem_addr(addr);
430
431                // LDR encoding: cond(4) | 01 | I(1) | P(1) | U(1) | B(1) | W(1) | L(1) | Rn(4) | Rd(4) | offset(12)
432                // P=1 (pre-indexed), U=1 (add offset), L=1 (load)
433                0xE5900000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
434            }
435
436            ArmOp::Str { rd, addr } => {
437                let rd_bits = reg_to_bits(rd);
438                let (base_bits, offset_bits) = encode_mem_addr(addr);
439
440                // STR encoding: L=0 (store)
441                0xE5800000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
442            }
443
444            // Sub-word loads (ARM32 encoding)
445            ArmOp::Ldrb { rd, addr } => {
446                let rd_bits = reg_to_bits(rd);
447                let (base_bits, offset_bits) = encode_mem_addr(addr);
448                // LDRB: LDR with B=1 (byte): cond|01|I|P|U|1|W|L|Rn|Rd|offset
449                0xE5D00000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
450            }
451
452            ArmOp::Ldrsb { rd, addr } => {
453                let rd_bits = reg_to_bits(rd);
454                let (base_bits, offset_bits) = encode_mem_addr(addr);
455                // LDRSB (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1101|imm4L
456                // Simplified with immediate offset
457                let offset_val = offset_bits & 0xFF;
458                let imm4h = (offset_val >> 4) & 0xF;
459                let imm4l = offset_val & 0xF;
460                0xE1D000D0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
461            }
462
463            ArmOp::Ldrh { rd, addr } => {
464                let rd_bits = reg_to_bits(rd);
465                let (base_bits, offset_bits) = encode_mem_addr(addr);
466                // LDRH (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1011|imm4L
467                let offset_val = offset_bits & 0xFF;
468                let imm4h = (offset_val >> 4) & 0xF;
469                let imm4l = offset_val & 0xF;
470                0xE1D000B0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
471            }
472
473            ArmOp::Ldrsh { rd, addr } => {
474                let rd_bits = reg_to_bits(rd);
475                let (base_bits, offset_bits) = encode_mem_addr(addr);
476                // LDRSH (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1111|imm4L
477                let offset_val = offset_bits & 0xFF;
478                let imm4h = (offset_val >> 4) & 0xF;
479                let imm4l = offset_val & 0xF;
480                0xE1D000F0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
481            }
482
483            // Sub-word stores (ARM32 encoding)
484            ArmOp::Strb { rd, addr } => {
485                let rd_bits = reg_to_bits(rd);
486                let (base_bits, offset_bits) = encode_mem_addr(addr);
487                // STRB: STR with B=1 (byte): cond|01|I|P|U|1|W|0|Rn|Rd|offset
488                0xE5C00000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
489            }
490
491            ArmOp::Strh { rd, addr } => {
492                let rd_bits = reg_to_bits(rd);
493                let (base_bits, offset_bits) = encode_mem_addr(addr);
494                // STRH (misc store): cond|000|P|U|1|W|0|Rn|Rd|imm4H|1011|imm4L
495                let offset_val = offset_bits & 0xFF;
496                let imm4h = (offset_val >> 4) & 0xF;
497                let imm4l = offset_val & 0xF;
498                0xE1C000B0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
499            }
500
501            // Memory management (ARM32 encoding)
502            ArmOp::MemorySize { rd } => {
503                let rd_bits = reg_to_bits(rd);
504                // MOV rd, R10, LSR #16  (memory size in bytes / 65536 = pages)
505                // cond|000|1101|S|0000|Rd|shift5|type|0|Rm
506                // LSR #16: shift5=10000, type=01
507                0xE1A00820 | (rd_bits << 12) | 0x0A // Rm=R10, shift=16, LSR
508            }
509
510            ArmOp::MemoryGrow { rd, .. } => {
511                let rd_bits = reg_to_bits(rd);
512                // On embedded, always fail: MOV rd, #-1
513                0xE3E00000 | (rd_bits << 12) // MVN rd, #0 = MOV rd, #-1
514            }
515
516            // Label pseudo-instruction: emits no machine code
517            ArmOp::Label { .. } => {
518                return Ok(Vec::new());
519            }
520
521            // Branch instructions
522            ArmOp::B { label: _ } => {
523                // B encoding: cond(4) | 1010 | offset(24)
524                // Simplified: branch to offset 0 (will be patched by linker/resolver)
525                0xEA000000
526            }
527
528            // Conditional branch to label (generic)
529            ArmOp::Bcc { cond, label: _ } => {
530                use synth_synthesis::Condition;
531                let cond_bits: u32 = match cond {
532                    Condition::EQ => 0x0,
533                    Condition::NE => 0x1,
534                    Condition::HS => 0x2,
535                    Condition::LO => 0x3,
536                    Condition::HI => 0x8,
537                    Condition::LS => 0x9,
538                    Condition::GE => 0xA,
539                    Condition::LT => 0xB,
540                    Condition::GT => 0xC,
541                    Condition::LE => 0xD,
542                };
543                // B<cond> with offset 0 (will be patched)
544                (cond_bits << 28) | 0x0A000000
545            }
546
547            // BHS (Branch if Higher or Same) - used for bounds checking
548            ArmOp::Bhs { label: _ } => {
549                // BHS encoding: cond(2=HS) | 1010 | offset(24)
550                0x2A000000 // BHS with offset 0
551            }
552
553            // BLO (Branch if Lower) - complementary to BHS
554            ArmOp::Blo { label: _ } => {
555                // BLO encoding: cond(3=LO) | 1010 | offset(24)
556                0x3A000000 // BLO with offset 0
557            }
558
559            // Branch with numeric offset (in instructions)
560            // ARM32 B instruction: offset is in instructions, stored as words
561            // The offset is relative to PC+8 (due to ARM pipeline)
562            ArmOp::BOffset { offset } => {
563                // B encoding: cond(4) | 1010 | offset(24)
564                // Offset is signed, in words (4-byte units)
565                // ARM adds PC+8 to the offset, so we need to adjust:
566                // target = PC + 8 + (offset * 4)
567                // For backward branch of N instructions: offset = -(N + 2)
568                // wrapping_sub keeps the encoder total under fuzzing (#186): an
569                // extreme i32::MIN offset would otherwise overflow-panic; for any
570                // real branch offset this is identical to `- 2`.
571                let adjusted_offset = offset.wrapping_sub(2); // Account for PC+8
572                let offset_bits = (adjusted_offset as u32) & 0x00FFFFFF;
573                0xEA000000 | offset_bits
574            }
575
576            // Conditional branch with numeric offset
577            ArmOp::BCondOffset { cond, offset } => {
578                use synth_synthesis::Condition;
579                let cond_bits: u32 = match cond {
580                    Condition::EQ => 0x0,
581                    Condition::NE => 0x1,
582                    Condition::HS => 0x2,
583                    Condition::LO => 0x3,
584                    Condition::HI => 0x8,
585                    Condition::LS => 0x9,
586                    Condition::GE => 0xA,
587                    Condition::LT => 0xB,
588                    Condition::GT => 0xC,
589                    Condition::LE => 0xD,
590                };
591                // B<cond> encoding: cond(4) | 1010 | offset(24)
592                // wrapping_sub: total under fuzzing (#186), identical for real offsets.
593                let adjusted_offset = offset.wrapping_sub(2); // Account for PC+8
594                let offset_bits = (adjusted_offset as u32) & 0x00FFFFFF;
595                (cond_bits << 28) | 0x0A000000 | offset_bits
596            }
597
598            ArmOp::Bl { label: _ } => {
599                // BL encoding: cond(4) | 1011 | offset(24)
600                0xEB000000
601            }
602
603            ArmOp::Bx { rm } => {
604                let rm_bits = reg_to_bits(rm);
605
606                // BX encoding: cond(4) | 000100101111111111110001 | Rm(4)
607                0xE12FFF10 | rm_bits
608            }
609
610            ArmOp::Blx { rm } => {
611                let rm_bits = reg_to_bits(rm);
612
613                // BLX (register) encoding: cond(4) | 000100101111111111110011 | Rm(4)
614                0xE12FFF30 | rm_bits
615            }
616
617            ArmOp::Push { regs } => {
618                // STMDB SP!, {regs} encoding: cond(4) | 100100 | 10 | 1101 | register_list(16)
619                let mut reg_list: u32 = 0;
620                for r in regs {
621                    reg_list |= 1 << reg_to_bits(r);
622                }
623                0xE92D0000 | reg_list
624            }
625
626            ArmOp::Pop { regs } => {
627                // LDMIA SP!, {regs} encoding: cond(4) | 100010 | 11 | 1101 | register_list(16)
628                let mut reg_list: u32 = 0;
629                for r in regs {
630                    reg_list |= 1 << reg_to_bits(r);
631                }
632                0xE8BD0000 | reg_list
633            }
634
635            ArmOp::Nop => {
636                // NOP encoding: MOV R0, R0
637                0xE1A00000
638            }
639
640            ArmOp::Udf { imm } => {
641                // UDF (Undefined) encoding in ARM: 0xE7F000F0 | (imm12_hi << 8) | imm4_lo
642                // We only use imm8, so split into imm4_hi and imm4_lo
643                let imm8 = *imm as u32;
644                0xE7F000F0 | ((imm8 & 0xF0) << 4) | (imm8 & 0x0F)
645            }
646
647            // Pseudo-instructions for verification - encode as NOP
648            // These are used in formal verification but not actual code generation
649            ArmOp::Popcnt { .. } => {
650                // Population count pseudo-instruction
651                // Not a real ARM instruction, would be expanded to actual code
652                0xE1A00000 // NOP for now
653            }
654
655            ArmOp::SetCond { .. } => {
656                // Condition evaluation pseudo-instruction
657                // Not a real ARM instruction, would be expanded to actual code
658                0xE1A00000 // NOP for now
659            }
660
661            ArmOp::SelectMove { .. } => {
662                // Conditional move pseudo-instruction for ARM32
663                // Would use MOV{cond} instruction
664                0xE1A00000 // NOP for now
665            }
666
667            ArmOp::Select { .. } => {
668                // Select pseudo-instruction
669                // Not a real ARM instruction, would be expanded to conditional moves
670                0xE1A00000 // NOP for now
671            }
672
673            ArmOp::LocalGet { .. } => {
674                // Local variable get pseudo-instruction
675                // Not a real ARM instruction, would be expanded to memory access
676                0xE1A00000 // NOP for now
677            }
678
679            ArmOp::LocalSet { .. } => {
680                // Local variable set pseudo-instruction
681                // Not a real ARM instruction, would be expanded to memory access
682                0xE1A00000 // NOP for now
683            }
684
685            ArmOp::LocalTee { .. } => {
686                // Local variable tee pseudo-instruction
687                // Not a real ARM instruction, would be expanded to memory access
688                0xE1A00000 // NOP for now
689            }
690
691            ArmOp::GlobalGet { .. } => {
692                // Global variable get pseudo-instruction
693                // Not a real ARM instruction, would be expanded to memory access
694                0xE1A00000 // NOP for now
695            }
696
697            ArmOp::GlobalSet { .. } => {
698                // Global variable set pseudo-instruction
699                // Not a real ARM instruction, would be expanded to memory access
700                0xE1A00000 // NOP for now
701            }
702
703            ArmOp::BrTable { .. } => {
704                // Branch table pseudo-instruction
705                // Not a real ARM instruction, would be expanded to jump table
706                0xE1A00000 // NOP for now
707            }
708
709            ArmOp::Call { .. } => {
710                // Function call pseudo-instruction
711                // Not a real ARM instruction, would be expanded to BL
712                0xE1A00000 // NOP for now
713            }
714
715            ArmOp::CallIndirect { .. } => {
716                // Indirect function call pseudo-instruction
717                // Not a real ARM instruction, would be expanded to indirect branch
718                0xE1A00000 // NOP for now
719            }
720
721            // i64 pseudo-instructions (Phase 2) - encode as NOP for now
722            // Real compiler would expand these to multi-instruction sequences
723            ArmOp::I64Add { .. } => 0xE1A00000,        // NOP
724            ArmOp::I64Sub { .. } => 0xE1A00000,        // NOP
725            ArmOp::I64DivS { .. } => 0xE1A00000,       // NOP
726            ArmOp::I64DivU { .. } => 0xE1A00000,       // NOP
727            ArmOp::I64RemS { .. } => 0xE1A00000,       // NOP
728            ArmOp::I64RemU { .. } => 0xE1A00000,       // NOP
729            ArmOp::I64Clz { .. } => 0xE1A00000,        // NOP
730            ArmOp::I64Ctz { .. } => 0xE1A00000,        // NOP
731            ArmOp::I64Popcnt { .. } => 0xE1A00000,     // NOP
732            ArmOp::I64And { .. } => 0xE1A00000,        // NOP
733            ArmOp::I64Or { .. } => 0xE1A00000,         // NOP
734            ArmOp::I64Xor { .. } => 0xE1A00000,        // NOP
735            ArmOp::I64Eqz { .. } => 0xE1A00000,        // NOP
736            ArmOp::I64Eq { .. } => 0xE1A00000,         // NOP
737            ArmOp::I64Ne { .. } => 0xE1A00000,         // NOP
738            ArmOp::I64LtS { .. } => 0xE1A00000,        // NOP
739            ArmOp::I64LtU { .. } => 0xE1A00000,        // NOP
740            ArmOp::I64LeS { .. } => 0xE1A00000,        // NOP
741            ArmOp::I64LeU { .. } => 0xE1A00000,        // NOP
742            ArmOp::I64GtS { .. } => 0xE1A00000,        // NOP
743            ArmOp::I64GtU { .. } => 0xE1A00000,        // NOP
744            ArmOp::I64GeS { .. } => 0xE1A00000,        // NOP
745            ArmOp::I64GeU { .. } => 0xE1A00000,        // NOP
746            ArmOp::I64Const { .. } => 0xE1A00000,      // NOP
747            ArmOp::I64Ldr { .. } => 0xE1A00000,        // NOP
748            ArmOp::I64Str { .. } => 0xE1A00000,        // NOP
749            ArmOp::I64ExtendI32S { .. } => 0xE1A00000, // NOP
750            ArmOp::I64ExtendI32U { .. } => 0xE1A00000, // NOP
751            ArmOp::I64Extend8S { .. } => 0xE1A00000,   // NOP (Thumb-2 only)
752            ArmOp::I64Extend16S { .. } => 0xE1A00000,  // NOP (Thumb-2 only)
753            ArmOp::I64Extend32S { .. } => 0xE1A00000,  // NOP (Thumb-2 only)
754            ArmOp::I32WrapI64 { .. } => 0xE1A00000,    // NOP
755
756            // f32 VFP single-precision instructions
757            ArmOp::F32Add { sd, sn, sm } => encode_vfp_3reg(0xEE300A00, sd, sn, sm)?,
758            ArmOp::F32Sub { sd, sn, sm } => encode_vfp_3reg(0xEE300A40, sd, sn, sm)?,
759            ArmOp::F32Mul { sd, sn, sm } => encode_vfp_3reg(0xEE200A00, sd, sn, sm)?,
760            ArmOp::F32Div { sd, sn, sm } => encode_vfp_3reg(0xEE800A00, sd, sn, sm)?,
761            ArmOp::F32Abs { sd, sm } => encode_vfp_2reg(0xEEB00AC0, sd, sm)?,
762            ArmOp::F32Neg { sd, sm } => encode_vfp_2reg(0xEEB10A40, sd, sm)?,
763            ArmOp::F32Sqrt { sd, sm } => encode_vfp_2reg(0xEEB10AC0, sd, sm)?,
764
765            // f32 pseudo-ops — multi-instruction sequences
766            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
767            ArmOp::F32Ceil { sd, sm } => {
768                return self.encode_arm_f32_rounding(sd, sm, 0b01); // Round toward +Inf
769            }
770            ArmOp::F32Floor { sd, sm } => {
771                return self.encode_arm_f32_rounding(sd, sm, 0b10); // Round toward -Inf
772            }
773            ArmOp::F32Trunc { sd, sm } => {
774                return self.encode_arm_f32_rounding(sd, sm, 0b11); // VCVT toward zero
775            }
776            ArmOp::F32Nearest { sd, sm } => {
777                return self.encode_arm_f32_rounding(sd, sm, 0b00); // VCVT to nearest
778            }
779            ArmOp::F32Min { sd, sn, sm } => {
780                return self.encode_arm_f32_minmax(sd, sn, sm, true);
781            }
782            ArmOp::F32Max { sd, sn, sm } => {
783                return self.encode_arm_f32_minmax(sd, sn, sm, false);
784            }
785            ArmOp::F32Copysign { sd, sn, sm } => {
786                return self.encode_arm_f32_copysign(sd, sn, sm);
787            }
788
789            // f32 comparisons — multi-instruction: VCMP + VMRS + conditional MOV
790            ArmOp::F32Eq { rd, sn, sm } => {
791                return self.encode_arm_f32_compare(rd, sn, sm, 0x0); // EQ
792            }
793            ArmOp::F32Ne { rd, sn, sm } => {
794                return self.encode_arm_f32_compare(rd, sn, sm, 0x1); // NE
795            }
796            ArmOp::F32Lt { rd, sn, sm } => {
797                return self.encode_arm_f32_compare(rd, sn, sm, 0x4); // MI (less than)
798            }
799            ArmOp::F32Le { rd, sn, sm } => {
800                return self.encode_arm_f32_compare(rd, sn, sm, 0x9); // LS (less or same)
801            }
802            ArmOp::F32Gt { rd, sn, sm } => {
803                return self.encode_arm_f32_compare(rd, sn, sm, 0xC); // GT
804            }
805            ArmOp::F32Ge { rd, sn, sm } => {
806                return self.encode_arm_f32_compare(rd, sn, sm, 0xA); // GE
807            }
808
809            // f32 const — multi-instruction: MOVW + MOVT + VMOV
810            ArmOp::F32Const { sd, value } => {
811                return self.encode_arm_f32_const(sd, *value);
812            }
813
814            ArmOp::F32Load { sd, addr } => encode_vfp_ldst(0xED900A00, sd, addr)?,
815            ArmOp::F32Store { sd, addr } => encode_vfp_ldst(0xED800A00, sd, addr)?,
816
817            // f32 conversions — multi-instruction sequences
818            ArmOp::F32ConvertI32S { sd, rm } => {
819                return self.encode_arm_f32_convert_i32(sd, rm, true);
820            }
821            ArmOp::F32ConvertI32U { sd, rm } => {
822                return self.encode_arm_f32_convert_i32(sd, rm, false);
823            }
824            ArmOp::F32ConvertI64S { .. } | ArmOp::F32ConvertI64U { .. } => {
825                return Err(synth_core::Error::synthesis(
826                    "F32 i64 conversion not supported (requires register pairs on 32-bit ARM)",
827                ));
828            }
829            ArmOp::F32ReinterpretI32 { sd, rm } => encode_vmov_core_sreg(true, sd, rm)?,
830            ArmOp::I32ReinterpretF32 { rd, sm } => encode_vmov_core_sreg(false, sm, rd)?,
831            ArmOp::I32TruncF32S { rd, sm } => {
832                return self.encode_arm_i32_trunc_f32(rd, sm, true);
833            }
834            ArmOp::I32TruncF32U { rd, sm } => {
835                return self.encode_arm_i32_trunc_f32(rd, sm, false);
836            }
837
838            // f64 VFP double-precision instructions (ARM32)
839            // F64 arithmetic: same as F32 but with sz=1 (bit 8 = 1, cp11 = 0xB)
840            ArmOp::F64Add { dd, dn, dm } => encode_vfp_3reg_f64(0xEE300B00, dd, dn, dm)?,
841            ArmOp::F64Sub { dd, dn, dm } => encode_vfp_3reg_f64(0xEE300B40, dd, dn, dm)?,
842            ArmOp::F64Mul { dd, dn, dm } => encode_vfp_3reg_f64(0xEE200B00, dd, dn, dm)?,
843            ArmOp::F64Div { dd, dn, dm } => encode_vfp_3reg_f64(0xEE800B00, dd, dn, dm)?,
844            ArmOp::F64Abs { dd, dm } => encode_vfp_2reg_f64(0xEEB00BC0, dd, dm)?,
845            ArmOp::F64Neg { dd, dm } => encode_vfp_2reg_f64(0xEEB10B40, dd, dm)?,
846            ArmOp::F64Sqrt { dd, dm } => encode_vfp_2reg_f64(0xEEB10BC0, dd, dm)?,
847
848            // f64 pseudo-ops
849            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
850            ArmOp::F64Ceil { dd, dm } => {
851                return self.encode_arm_f64_rounding(dd, dm, 0b01);
852            }
853            ArmOp::F64Floor { dd, dm } => {
854                return self.encode_arm_f64_rounding(dd, dm, 0b10);
855            }
856            ArmOp::F64Trunc { dd, dm } => {
857                return self.encode_arm_f64_rounding(dd, dm, 0b11);
858            }
859            ArmOp::F64Nearest { dd, dm } => {
860                return self.encode_arm_f64_rounding(dd, dm, 0b00);
861            }
862            ArmOp::F64Min { dd, dn, dm } => {
863                return self.encode_arm_f64_minmax(dd, dn, dm, true);
864            }
865            ArmOp::F64Max { dd, dn, dm } => {
866                return self.encode_arm_f64_minmax(dd, dn, dm, false);
867            }
868            ArmOp::F64Copysign { dd, dn, dm } => {
869                return self.encode_arm_f64_copysign(dd, dn, dm);
870            }
871
872            // f64 comparisons
873            ArmOp::F64Eq { rd, dn, dm } => {
874                return self.encode_arm_f64_compare(rd, dn, dm, 0x0);
875            }
876            ArmOp::F64Ne { rd, dn, dm } => {
877                return self.encode_arm_f64_compare(rd, dn, dm, 0x1);
878            }
879            ArmOp::F64Lt { rd, dn, dm } => {
880                return self.encode_arm_f64_compare(rd, dn, dm, 0x4);
881            }
882            ArmOp::F64Le { rd, dn, dm } => {
883                return self.encode_arm_f64_compare(rd, dn, dm, 0x9);
884            }
885            ArmOp::F64Gt { rd, dn, dm } => {
886                return self.encode_arm_f64_compare(rd, dn, dm, 0xC);
887            }
888            ArmOp::F64Ge { rd, dn, dm } => {
889                return self.encode_arm_f64_compare(rd, dn, dm, 0xA);
890            }
891
892            ArmOp::F64Const { dd, value } => {
893                return self.encode_arm_f64_const(dd, *value);
894            }
895
896            ArmOp::F64Load { dd, addr } => encode_vfp_ldst_f64(0xED900B00, dd, addr)?,
897            ArmOp::F64Store { dd, addr } => encode_vfp_ldst_f64(0xED800B00, dd, addr)?,
898
899            ArmOp::F64ConvertI32S { dd, rm } => {
900                return self.encode_arm_f64_convert_i32(dd, rm, true);
901            }
902            ArmOp::F64ConvertI32U { dd, rm } => {
903                return self.encode_arm_f64_convert_i32(dd, rm, false);
904            }
905            ArmOp::F64ConvertI64S { .. } | ArmOp::F64ConvertI64U { .. } => {
906                return Err(synth_core::Error::synthesis(
907                    "F64 i64 conversion not supported (requires register pairs on 32-bit ARM)",
908                ));
909            }
910            ArmOp::F64PromoteF32 { dd, sm } => {
911                return self.encode_arm_f64_promote_f32(dd, sm);
912            }
913            ArmOp::F64ReinterpretI64 { dd, rmlo, rmhi } => {
914                encode_vmov_core_dreg(true, dd, rmlo, rmhi)?
915            }
916            ArmOp::I64ReinterpretF64 { rdlo, rdhi, dm } => {
917                encode_vmov_core_dreg(false, dm, rdlo, rdhi)?
918            }
919            ArmOp::I64TruncF64S { .. } | ArmOp::I64TruncF64U { .. } => {
920                return Err(synth_core::Error::synthesis(
921                    "i64 truncation from F64 not supported (requires i64 register pairs on 32-bit ARM)",
922                ));
923            }
924            ArmOp::I32TruncF64S { rd, dm } => {
925                return self.encode_arm_i32_trunc_f64(rd, dm, true);
926            }
927            ArmOp::I32TruncF64U { rd, dm } => {
928                return self.encode_arm_i32_trunc_f64(rd, dm, false);
929            }
930            // Multi-instruction sequences - only meaningful in Thumb-2 mode
931            ArmOp::I64SetCond { .. }
932            | ArmOp::I64SetCondZ { .. }
933            | ArmOp::I64Mul { .. }
934            | ArmOp::I64Shl { .. }
935            | ArmOp::I64ShrS { .. }
936            | ArmOp::I64ShrU { .. }
937            | ArmOp::I64Rotl { .. }
938            | ArmOp::I64Rotr { .. } => 0xE1A00000, // NOP (Thumb-2 only)
939
940            // MVE instructions — Thumb-2 only (Cortex-M55 is always Thumb-2)
941            ArmOp::MveLoad { .. }
942            | ArmOp::MveStore { .. }
943            | ArmOp::MveConst { .. }
944            | ArmOp::MveAnd { .. }
945            | ArmOp::MveOrr { .. }
946            | ArmOp::MveEor { .. }
947            | ArmOp::MveMvn { .. }
948            | ArmOp::MveBic { .. }
949            | ArmOp::MveAddI { .. }
950            | ArmOp::MveSubI { .. }
951            | ArmOp::MveMulI { .. }
952            | ArmOp::MveNegI { .. }
953            | ArmOp::MveCmpEqI { .. }
954            | ArmOp::MveCmpNeI { .. }
955            | ArmOp::MveCmpLtS { .. }
956            | ArmOp::MveCmpLtU { .. }
957            | ArmOp::MveCmpGtS { .. }
958            | ArmOp::MveCmpGtU { .. }
959            | ArmOp::MveCmpLeS { .. }
960            | ArmOp::MveCmpLeU { .. }
961            | ArmOp::MveCmpGeS { .. }
962            | ArmOp::MveCmpGeU { .. }
963            | ArmOp::MveDup { .. }
964            | ArmOp::MveExtractLane { .. }
965            | ArmOp::MveInsertLane { .. }
966            | ArmOp::MveAddF32 { .. }
967            | ArmOp::MveSubF32 { .. }
968            | ArmOp::MveMulF32 { .. }
969            | ArmOp::MveNegF32 { .. }
970            | ArmOp::MveAbsF32 { .. }
971            | ArmOp::MveCmpEqF32 { .. }
972            | ArmOp::MveCmpNeF32 { .. }
973            | ArmOp::MveCmpLtF32 { .. }
974            | ArmOp::MveCmpLeF32 { .. }
975            | ArmOp::MveCmpGtF32 { .. }
976            | ArmOp::MveCmpGeF32 { .. }
977            | ArmOp::MveDupF32 { .. }
978            | ArmOp::MveExtractLaneF32 { .. }
979            | ArmOp::MveReplaceLaneF32 { .. }
980            | ArmOp::MveDivF32 { .. }
981            | ArmOp::MveSqrtF32 { .. } => 0xE1A00000, // NOP (MVE = Thumb-2 only)
982        };
983
984        // ARM32 instructions are little-endian
985        Ok(instr.to_le_bytes().to_vec())
986    }
987
988    // === ARM32 VFP multi-instruction helpers ===
989
990    /// Encode F32 comparison as ARM32: VCMP.F32 + VMRS + MOV rd,#0 + MOVcond rd,#1
991    fn encode_arm_f32_compare(
992        &self,
993        rd: &Reg,
994        sn: &VfpReg,
995        sm: &VfpReg,
996        cond_code: u32,
997    ) -> Result<Vec<u8>> {
998        let mut bytes = Vec::new();
999
1000        // VCMP.F32 Sn, Sm: 0xEEB40A40 with Sn in Vd position, Sm in Vm position
1001        let sn_num = vfp_sreg_to_num(sn)?;
1002        let sm_num = vfp_sreg_to_num(sm)?;
1003        let (vd, d) = encode_sreg(sn_num);
1004        let (vm, m) = encode_sreg(sm_num);
1005        let vcmp = 0xEEB40A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1006        bytes.extend_from_slice(&vcmp.to_le_bytes());
1007
1008        // VMRS APSR_nzcv, FPSCR: 0xEEF1FA10
1009        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1010
1011        // MOV rd, #0: 0xE3A0_0000 | (rd << 12)
1012        let rd_bits = reg_to_bits(rd);
1013        let mov_zero = 0xE3A00000 | (rd_bits << 12);
1014        bytes.extend_from_slice(&mov_zero.to_le_bytes());
1015
1016        // MOVcond rd, #1: cond(4) | 0011 1010 0000 rd(4) 0000 0000 0001
1017        let mov_one = (cond_code << 28) | 0x03A00001 | (rd_bits << 12);
1018        bytes.extend_from_slice(&mov_one.to_le_bytes());
1019
1020        Ok(bytes)
1021    }
1022
1023    /// Encode F32 constant load as ARM32: MOVW Rt,#lo16 + MOVT Rt,#hi16 + VMOV Sd,Rt
1024    fn encode_arm_f32_const(&self, sd: &VfpReg, value: f32) -> Result<Vec<u8>> {
1025        let mut bytes = Vec::new();
1026        let bits = value.to_bits();
1027
1028        // Use R12 as temp register for constant loading
1029        let rt: u32 = 12; // R12/IP
1030
1031        // MOVW R12, #lo16: 0xE300_C000 | (imm4 << 16) | imm12
1032        let lo16 = bits & 0xFFFF;
1033        let movw = 0xE3000000 | (rt << 12) | ((lo16 >> 12) << 16) | (lo16 & 0xFFF);
1034        bytes.extend_from_slice(&movw.to_le_bytes());
1035
1036        // MOVT R12, #hi16: 0xE340_C000 | (imm4 << 16) | imm12
1037        let hi16 = (bits >> 16) & 0xFFFF;
1038        let movt = 0xE3400000 | (rt << 12) | ((hi16 >> 12) << 16) | (hi16 & 0xFFF);
1039        bytes.extend_from_slice(&movt.to_le_bytes());
1040
1041        // VMOV Sd, R12
1042        let vmov = encode_vmov_core_sreg(true, sd, &Reg::R12)?;
1043        bytes.extend_from_slice(&vmov.to_le_bytes());
1044
1045        Ok(bytes)
1046    }
1047
1048    /// Encode VMOV + VCVT.F32.S32/U32 as ARM32
1049    fn encode_arm_f32_convert_i32(&self, sd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
1050        let mut bytes = Vec::new();
1051
1052        // VMOV Sd, Rm — move integer to VFP register
1053        let vmov = encode_vmov_core_sreg(true, sd, rm)?;
1054        bytes.extend_from_slice(&vmov.to_le_bytes());
1055
1056        // VCVT.F32.S32 Sd, Sd (signed) or VCVT.F32.U32 Sd, Sd (unsigned)
1057        // Base: 0xEEB80A40 (signed) or 0xEEB80AC0 (unsigned)
1058        let sd_num = vfp_sreg_to_num(sd)?;
1059        let (vd, d) = encode_sreg(sd_num);
1060        let (vm, m) = encode_sreg(sd_num); // same register as source
1061        let base = if signed { 0xEEB80A40 } else { 0xEEB80AC0 };
1062        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
1063        bytes.extend_from_slice(&vcvt.to_le_bytes());
1064
1065        Ok(bytes)
1066    }
1067
1068    /// Encode F32 rounding pseudo-op as ARM32 via VCVT to integer and back.
1069    /// mode: 0b00=nearest, 0b01=floor(-Inf), 0b10=ceil(+Inf), 0b11=trunc(zero)
1070    /// Strategy: VCVT.S32.F32 Sd, Sm (toward zero), then VCVT.F32.S32 Sd, Sd
1071    /// For ceil/floor/nearest, we use VCVTR (round toward mode) + convert back.
1072    /// Simplified: convert to int (toward zero for trunc) then back to float.
1073    /// Encode F32 rounding as ARM32.
1074    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
1075    ///
1076    /// For trunc (mode=0b11): uses VCVTR.S32.F32 (always rounds toward zero).
1077    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F32 (non-R variant
1078    /// which honours FPSCR rmode), then restores FPSCR.
1079    fn encode_arm_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
1080        let mut bytes = Vec::new();
1081        let sm_num = vfp_sreg_to_num(sm)?;
1082        let sd_num = vfp_sreg_to_num(sd)?;
1083        let (vd_s, d_s) = encode_sreg(sd_num);
1084        let (vm_s, m_s) = encode_sreg(sm_num);
1085
1086        if mode == 0b11 {
1087            // Trunc (toward zero): VCVTR.S32.F32 — the "R" variant always truncates.
1088            // 0xEEBD0AC0: bit[7]=1 => round toward zero regardless of FPSCR
1089            let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
1090            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1091        } else {
1092            // ceil/floor/nearest: manipulate FPSCR rounding mode
1093            let rt: u32 = 12; // R12/IP as temp
1094
1095            // VMRS R12, FPSCR
1096            let vmrs = 0xEEF10A10 | (rt << 12);
1097            bytes.extend_from_slice(&vmrs.to_le_bytes());
1098
1099            // BIC R12, R12, #(3 << 22) — clear RMode bits [23:22]
1100            // 3<<22 = 0x00C00000. ARM rotated imm: 0x03 ror 10 (rotation=5, imm8=0x03)
1101            let bic = 0xE3CC0000 | (rt << 12) | (0x05 << 8) | 0x03;
1102            bytes.extend_from_slice(&bic.to_le_bytes());
1103
1104            // ORR R12, R12, #(mode << 22) — set desired rounding mode
1105            if mode != 0 {
1106                // mode<<22: rotation=5, imm8=mode
1107                let orr = 0xE38C0000 | (rt << 12) | (0x05 << 8) | (mode as u32);
1108                bytes.extend_from_slice(&orr.to_le_bytes());
1109            }
1110
1111            // VMSR FPSCR, R12
1112            let vmsr = 0xEEE10A10 | (rt << 12);
1113            bytes.extend_from_slice(&vmsr.to_le_bytes());
1114
1115            // VCVT.S32.F32 Sd, Sm — non-R variant (bit[7]=0), uses FPSCR rounding mode
1116            let vcvt_to_int = 0xEEBD0A40 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
1117            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1118
1119            // Restore FPSCR: clear rmode bits back to nearest (default)
1120            bytes.extend_from_slice(&vmrs.to_le_bytes());
1121            bytes.extend_from_slice(&bic.to_le_bytes());
1122            bytes.extend_from_slice(&vmsr.to_le_bytes());
1123        }
1124
1125        // VCVT.F32.S32 Sd, Sd (convert integer result back to float)
1126        let (vd2, d2) = encode_sreg(sd_num);
1127        let vcvt_to_float = 0xEEB80A40 | (d2 << 22) | (vd2 << 12) | (d_s << 5) | vd_s;
1128        bytes.extend_from_slice(&vcvt_to_float.to_le_bytes());
1129
1130        Ok(bytes)
1131    }
1132
1133    /// Encode F32 min/max as ARM32: VCMP + VMRS + conditional VMOV
1134    fn encode_arm_f32_minmax(
1135        &self,
1136        sd: &VfpReg,
1137        sn: &VfpReg,
1138        sm: &VfpReg,
1139        is_min: bool,
1140    ) -> Result<Vec<u8>> {
1141        let mut bytes = Vec::new();
1142        let sn_num = vfp_sreg_to_num(sn)?;
1143        let sm_num = vfp_sreg_to_num(sm)?;
1144        let sd_num = vfp_sreg_to_num(sd)?;
1145
1146        // VMOV Sd, Sn (start with first operand)
1147        let (vd, d) = encode_sreg(sd_num);
1148        let (vn, n) = encode_sreg(sn_num);
1149        let vmov_sn = 0xEEB00A40 | (d << 22) | (vd << 12) | (n << 5) | vn;
1150        bytes.extend_from_slice(&vmov_sn.to_le_bytes());
1151
1152        // VCMP.F32 Sn, Sm
1153        let (vm, m) = encode_sreg(sm_num);
1154        let vcmp = 0xEEB40A40 | (n << 22) | (vn << 12) | (m << 5) | vm;
1155        bytes.extend_from_slice(&vcmp.to_le_bytes());
1156
1157        // VMRS APSR_nzcv, FPSCR
1158        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1159
1160        // For min: if Sn > Sm (GT), use Sm. Condition = GT (0xC)
1161        // For max: if Sn < Sm (MI/LT), use Sm. Condition = MI (0x4)
1162        let cond = if is_min { 0xCu32 } else { 0x4u32 };
1163
1164        // VMOV{cond} Sd, Sm — conditional VMOV
1165        let vmov_cond = (cond << 28) | 0x0EB00A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1166        bytes.extend_from_slice(&vmov_cond.to_le_bytes());
1167
1168        Ok(bytes)
1169    }
1170
1171    /// Encode F32 copysign as ARM32: extract sign from Sm, magnitude from Sn
1172    fn encode_arm_f32_copysign(&self, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
1173        let mut bytes = Vec::new();
1174
1175        // VMOV R12, Sm (get sign source bits)
1176        let vmov_sm = encode_vmov_core_sreg(false, sm, &Reg::R12)?;
1177        bytes.extend_from_slice(&vmov_sm.to_le_bytes());
1178
1179        // VMOV R0, Sn (get magnitude source bits) — use R0 as temp
1180        let vmov_sn = encode_vmov_core_sreg(false, sn, &Reg::R0)?;
1181        bytes.extend_from_slice(&vmov_sn.to_le_bytes());
1182
1183        // AND R12, R12, #0x80000000 (keep only sign bit)
1184        // Thumb-2 constant 0x80000000 needs special encoding; in ARM32 use rotated imm
1185        // 0x80000000 = 0x02 rotated right by 2 (rotation=1, imm8=0x02)
1186        let and_sign = 0xE2000000u32 | (12 << 16) | (12 << 12) | (1 << 8) | 0x02;
1187        bytes.extend_from_slice(&and_sign.to_le_bytes());
1188
1189        // BIC R0, R0, #0x80000000 (clear sign bit from magnitude)
1190        // R0 = register 0, so Rn and Rd fields are 0
1191        let bic_sign = 0xE3C00000u32 | (1 << 8) | 0x02;
1192        bytes.extend_from_slice(&bic_sign.to_le_bytes());
1193
1194        // ORR R0, R0, R12 (combine sign + magnitude)
1195        // R0 = register 0, so Rn and Rd fields are 0
1196        let orr = 0xE1800000u32 | 12;
1197        bytes.extend_from_slice(&orr.to_le_bytes());
1198
1199        // VMOV Sd, R0
1200        let vmov_result = encode_vmov_core_sreg(true, sd, &Reg::R0)?;
1201        bytes.extend_from_slice(&vmov_result.to_le_bytes());
1202
1203        Ok(bytes)
1204    }
1205
1206    /// Encode F64 comparison as ARM32: VCMP.F64 + VMRS + MOV rd,#0 + MOVcond rd,#1
1207    fn encode_arm_f64_compare(
1208        &self,
1209        rd: &Reg,
1210        dn: &VfpReg,
1211        dm: &VfpReg,
1212        cond_code: u32,
1213    ) -> Result<Vec<u8>> {
1214        let mut bytes = Vec::new();
1215
1216        // VCMP.F64 Dn, Dm: 0xEEB40B40 with Dn in Vd position, Dm in Vm position
1217        let dn_num = vfp_dreg_to_num(dn)?;
1218        let dm_num = vfp_dreg_to_num(dm)?;
1219        let (vd, d) = encode_dreg(dn_num);
1220        let (vm, m) = encode_dreg(dm_num);
1221        let vcmp = 0xEEB40B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1222        bytes.extend_from_slice(&vcmp.to_le_bytes());
1223
1224        // VMRS APSR_nzcv, FPSCR
1225        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1226
1227        // MOV rd, #0
1228        let rd_bits = reg_to_bits(rd);
1229        let mov_zero = 0xE3A00000 | (rd_bits << 12);
1230        bytes.extend_from_slice(&mov_zero.to_le_bytes());
1231
1232        // MOVcond rd, #1
1233        let mov_one = (cond_code << 28) | 0x03A00001 | (rd_bits << 12);
1234        bytes.extend_from_slice(&mov_one.to_le_bytes());
1235
1236        Ok(bytes)
1237    }
1238
1239    /// Encode F64 constant load as ARM32: MOVW + MOVT + MOVW + MOVT + VMOV
1240    fn encode_arm_f64_const(&self, dd: &VfpReg, value: f64) -> Result<Vec<u8>> {
1241        let mut bytes = Vec::new();
1242        let bits = value.to_bits();
1243        let lo32 = bits as u32;
1244        let hi32 = (bits >> 32) as u32;
1245
1246        // Load low 32 bits into R0 (Rd field = 0 for R0)
1247        let lo16 = lo32 & 0xFFFF;
1248        let movw_r0 = 0xE3000000 | ((lo16 >> 12) << 16) | (lo16 & 0xFFF);
1249        bytes.extend_from_slice(&movw_r0.to_le_bytes());
1250        let hi16 = (lo32 >> 16) & 0xFFFF;
1251        let movt_r0 = 0xE3400000 | ((hi16 >> 12) << 16) | (hi16 & 0xFFF);
1252        bytes.extend_from_slice(&movt_r0.to_le_bytes());
1253
1254        // Load high 32 bits into R12
1255        let lo16 = hi32 & 0xFFFF;
1256        let movw_r12 = 0xE3000000 | ((lo16 >> 12) << 16) | (12 << 12) | (lo16 & 0xFFF);
1257        bytes.extend_from_slice(&movw_r12.to_le_bytes());
1258        let hi16 = (hi32 >> 16) & 0xFFFF;
1259        let movt_r12 = 0xE3400000 | ((hi16 >> 12) << 16) | (12 << 12) | (hi16 & 0xFFF);
1260        bytes.extend_from_slice(&movt_r12.to_le_bytes());
1261
1262        // VMOV Dd, R0, R12
1263        let vmov = encode_vmov_core_dreg(true, dd, &Reg::R0, &Reg::R12)?;
1264        bytes.extend_from_slice(&vmov.to_le_bytes());
1265
1266        Ok(bytes)
1267    }
1268
1269    /// Encode VMOV Sd, Rm + VCVT.F64.S32/U32 Dd, Sd as ARM32
1270    fn encode_arm_f64_convert_i32(&self, dd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
1271        let mut bytes = Vec::new();
1272
1273        // Use S0 as intermediate: VMOV S0, Rm
1274        let vmov = encode_vmov_core_sreg(true, &VfpReg::S0, rm)?;
1275        bytes.extend_from_slice(&vmov.to_le_bytes());
1276
1277        // VCVT.F64.S32 Dd, S0 (signed) or VCVT.F64.U32 Dd, S0 (unsigned)
1278        // Base: 0xEEB80B40 (signed) or 0xEEB80BC0 (unsigned)
1279        let dd_num = vfp_dreg_to_num(dd)?;
1280        let (vd, d) = encode_dreg(dd_num);
1281        let base = if signed { 0xEEB80B40 } else { 0xEEB80BC0 };
1282        // S0 is register 0: Vm=0, M=0
1283        let vcvt = base | (d << 22) | (vd << 12);
1284        bytes.extend_from_slice(&vcvt.to_le_bytes());
1285
1286        Ok(bytes)
1287    }
1288
1289    /// Encode VCVT.F64.F32 Dd, Sm as ARM32 (f32 to f64 promotion)
1290    fn encode_arm_f64_promote_f32(&self, dd: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
1291        let dd_num = vfp_dreg_to_num(dd)?;
1292        let sm_num = vfp_sreg_to_num(sm)?;
1293        let (vd, d) = encode_dreg(dd_num);
1294        let (vm, m) = encode_sreg(sm_num);
1295
1296        // VCVT.F64.F32 Dd, Sm: 0xEEB70AC0
1297        let vcvt = 0xEEB70AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
1298        Ok(vcvt.to_le_bytes().to_vec())
1299    }
1300
1301    /// Encode VCVT.S32/U32.F64 Sd, Dm + VMOV Rd, Sd as ARM32
1302    fn encode_arm_i32_trunc_f64(&self, rd: &Reg, dm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
1303        let mut bytes = Vec::new();
1304        let dm_num = vfp_dreg_to_num(dm)?;
1305        let (vm, m) = encode_dreg(dm_num);
1306
1307        // VCVT.S32.F64 S0, Dm (toward zero) or VCVT.U32.F64 S0, Dm
1308        // S0: Vd=0, D=0
1309        let base = if signed { 0xEEBD0BC0 } else { 0xEEBC0BC0 };
1310        let vcvt = base | (m << 5) | vm;
1311        bytes.extend_from_slice(&vcvt.to_le_bytes());
1312
1313        // VMOV Rd, S0
1314        let vmov = encode_vmov_core_sreg(false, &VfpReg::S0, rd)?;
1315        bytes.extend_from_slice(&vmov.to_le_bytes());
1316
1317        Ok(bytes)
1318    }
1319
1320    /// Encode F64 rounding pseudo-op as ARM32 via VCVT to integer and back.
1321    /// Encode F64 rounding as ARM32.
1322    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
1323    ///
1324    /// For trunc: uses VCVTR.S32.F64 (always truncates).
1325    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F64 (non-R variant),
1326    /// then restores FPSCR.
1327    fn encode_arm_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
1328        let mut bytes = Vec::new();
1329        let dm_num = vfp_dreg_to_num(dm)?;
1330        let dd_num = vfp_dreg_to_num(dd)?;
1331        let (vm, m) = encode_dreg(dm_num);
1332        let (vd, d) = encode_dreg(dd_num);
1333
1334        if mode == 0b11 {
1335            // Trunc (toward zero): VCVTR.S32.F64 — bit[7]=1, always truncates
1336            let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
1337            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1338        } else {
1339            // ceil/floor/nearest: manipulate FPSCR rounding mode
1340            let rt: u32 = 12;
1341
1342            // VMRS R12, FPSCR
1343            let vmrs = 0xEEF10A10 | (rt << 12);
1344            bytes.extend_from_slice(&vmrs.to_le_bytes());
1345
1346            // BIC R12, R12, #(3 << 22)
1347            let bic = 0xE3CC0000 | (rt << 12) | (0x05 << 8) | 0x03;
1348            bytes.extend_from_slice(&bic.to_le_bytes());
1349
1350            // ORR R12, R12, #(mode << 22)
1351            if mode != 0 {
1352                let orr = 0xE38C0000 | (rt << 12) | (0x05 << 8) | (mode as u32);
1353                bytes.extend_from_slice(&orr.to_le_bytes());
1354            }
1355
1356            // VMSR FPSCR, R12
1357            let vmsr = 0xEEE10A10 | (rt << 12);
1358            bytes.extend_from_slice(&vmsr.to_le_bytes());
1359
1360            // VCVT.S32.F64 S0, Dm — non-R variant (bit[7]=0), uses FPSCR rmode
1361            let vcvt_to_int = 0xEEBD0B40 | (m << 5) | vm;
1362            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1363
1364            // Restore FPSCR
1365            bytes.extend_from_slice(&vmrs.to_le_bytes());
1366            bytes.extend_from_slice(&bic.to_le_bytes());
1367            bytes.extend_from_slice(&vmsr.to_le_bytes());
1368        }
1369
1370        // VCVT.F64.S32 Dd, S0 (convert back to double)
1371        let vcvt_to_float = 0xEEB80B40 | (d << 22) | (vd << 12);
1372        bytes.extend_from_slice(&vcvt_to_float.to_le_bytes());
1373
1374        Ok(bytes)
1375    }
1376
1377    /// Encode F64 min/max as ARM32: VMOV + VCMP + VMRS + conditional VMOV
1378    fn encode_arm_f64_minmax(
1379        &self,
1380        dd: &VfpReg,
1381        dn: &VfpReg,
1382        dm: &VfpReg,
1383        is_min: bool,
1384    ) -> Result<Vec<u8>> {
1385        let mut bytes = Vec::new();
1386        let dn_num = vfp_dreg_to_num(dn)?;
1387        let dm_num = vfp_dreg_to_num(dm)?;
1388        let dd_num = vfp_dreg_to_num(dd)?;
1389
1390        // VMOV.F64 Dd, Dn (start with first operand)
1391        let (vd, d) = encode_dreg(dd_num);
1392        let (vn, n) = encode_dreg(dn_num);
1393        let vmov_dn = 0xEEB00B40 | (d << 22) | (vd << 12) | (n << 5) | vn;
1394        bytes.extend_from_slice(&vmov_dn.to_le_bytes());
1395
1396        // VCMP.F64 Dn, Dm
1397        let (vm, m) = encode_dreg(dm_num);
1398        let vcmp = 0xEEB40B40 | (n << 22) | (vn << 12) | (m << 5) | vm;
1399        bytes.extend_from_slice(&vcmp.to_le_bytes());
1400
1401        // VMRS APSR_nzcv, FPSCR
1402        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1403
1404        let cond = if is_min { 0xCu32 } else { 0x4u32 };
1405        let vmov_cond = (cond << 28) | 0x0EB00B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1406        bytes.extend_from_slice(&vmov_cond.to_le_bytes());
1407
1408        Ok(bytes)
1409    }
1410
1411    /// Encode F64 copysign as ARM32
1412    fn encode_arm_f64_copysign(&self, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<Vec<u8>> {
1413        let mut bytes = Vec::new();
1414
1415        // VMOV R0, R12, Dm (get sign source bits)
1416        let vmov_dm = encode_vmov_core_dreg(false, dm, &Reg::R0, &Reg::R12)?;
1417        bytes.extend_from_slice(&vmov_dm.to_le_bytes());
1418
1419        // VMOV R1, R2, Dn (get magnitude source bits)
1420        // We use R1 (lo) and R2 (hi) for the magnitude
1421        let vmov_dn = encode_vmov_core_dreg(false, dn, &Reg::R1, &Reg::R2)?;
1422        bytes.extend_from_slice(&vmov_dn.to_le_bytes());
1423
1424        // AND R12, R12, #0x80000000 (keep only sign bit from hi word)
1425        let and_sign = 0xE2000000u32 | (12 << 16) | (12 << 12) | (1 << 8) | 0x02;
1426        bytes.extend_from_slice(&and_sign.to_le_bytes());
1427
1428        // BIC R2, R2, #0x80000000 (clear sign bit from magnitude hi word)
1429        let bic_sign = 0xE3C00000u32 | (2 << 16) | (2 << 12) | (1 << 8) | 0x02;
1430        bytes.extend_from_slice(&bic_sign.to_le_bytes());
1431
1432        // ORR R2, R2, R12 (combine sign + magnitude)
1433        let orr = 0xE1800000u32 | (2 << 16) | (2 << 12) | 12;
1434        bytes.extend_from_slice(&orr.to_le_bytes());
1435
1436        // VMOV Dd, R1, R2
1437        let vmov_result = encode_vmov_core_dreg(true, dd, &Reg::R1, &Reg::R2)?;
1438        bytes.extend_from_slice(&vmov_result.to_le_bytes());
1439
1440        Ok(bytes)
1441    }
1442
1443    /// Encode VCVT.S32/U32.F32 + VMOV as ARM32
1444    fn encode_arm_i32_trunc_f32(&self, rd: &Reg, sm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
1445        let mut bytes = Vec::new();
1446
1447        // VCVT.S32.F32 Sd, Sm (toward zero) or VCVT.U32.F32 Sd, Sm
1448        // We use Sm as both source and destination for the intermediate result
1449        let sm_num = vfp_sreg_to_num(sm)?;
1450        let (vd, d) = encode_sreg(sm_num);
1451        let (vm, m) = encode_sreg(sm_num);
1452        let base = if signed { 0xEEBD0AC0 } else { 0xEEBC0AC0 };
1453        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
1454        bytes.extend_from_slice(&vcvt.to_le_bytes());
1455
1456        // VMOV Rd, Sm — move result back to core register
1457        let vmov = encode_vmov_core_sreg(false, sm, rd)?;
1458        bytes.extend_from_slice(&vmov.to_le_bytes());
1459
1460        Ok(bytes)
1461    }
1462
1463    /// Encode an ARM instruction in Thumb-2 mode (16-bit or 32-bit instructions)
1464    fn encode_thumb(&self, op: &ArmOp) -> Result<Vec<u8>> {
1465        // Thumb-2 supports both 16-bit and 32-bit instructions
1466        // 32-bit instructions are encoded as two 16-bit halfwords (big-endian order)
1467        match op {
1468            // === 16-bit Thumb encodings ===
1469            ArmOp::Add { rd, rn, op2 } => {
1470                let rd_bits = reg_to_bits(rd) as u16;
1471                let rn_bits = reg_to_bits(rn) as u16;
1472
1473                if let Operand2::Reg(rm) = op2 {
1474                    let rm_bits = reg_to_bits(rm) as u16;
1475                    // 16-bit ADDS only has 3-bit register fields (R0-R7). For
1476                    // high registers (e.g. R12, the MemLoad/MemStore base
1477                    // scratch) the bits overflow into adjacent fields, silently
1478                    // corrupting the operands — issue #178/#180: `add ip,ip,r0`
1479                    // was emitted as `adds r4,r5,r1`. Guard on all three regs
1480                    // being low and fall back to 32-bit ADD.W otherwise, exactly
1481                    // as the Sub handler below does.
1482                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1483                        // ADDS Rd, Rn, Rm (16-bit): 0001 100 Rm Rn Rd
1484                        let instr: u16 = 0x1800 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1485                        Ok(instr.to_le_bytes().to_vec())
1486                    } else {
1487                        // ADD.W Rd, Rn, Rm (32-bit) for high registers
1488                        self.encode_thumb32_add_reg_raw(
1489                            rd_bits as u32,
1490                            rn_bits as u32,
1491                            rm_bits as u32,
1492                        )
1493                    }
1494                } else if let Operand2::Imm(imm) = op2 {
1495                    if *imm <= 7 && rd_bits < 8 && rn_bits < 8 {
1496                        // ADDS Rd, Rn, #imm3 (16-bit): 0001 110 imm3 Rn Rd
1497                        let instr: u16 = 0x1C00 | ((*imm as u16) << 6) | (rn_bits << 3) | rd_bits;
1498                        Ok(instr.to_le_bytes().to_vec())
1499                    } else {
1500                        // Use 32-bit ADD for larger immediates
1501                        self.encode_thumb32_add(rd, rn, *imm as u32)
1502                    }
1503                } else {
1504                    // Fallback to 32-bit encoding
1505                    self.encode_thumb32_add(rd, rn, 0)
1506                }
1507            }
1508
1509            ArmOp::Sub { rd, rn, op2 } => {
1510                let rd_bits = reg_to_bits(rd) as u16;
1511                let rn_bits = reg_to_bits(rn) as u16;
1512
1513                if let Operand2::Reg(rm) = op2 {
1514                    let rm_bits = reg_to_bits(rm) as u16;
1515                    // 16-bit SUBS can only use low registers (R0-R7)
1516                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1517                        // SUBS Rd, Rn, Rm (16-bit): 0001 101 Rm Rn Rd
1518                        let instr: u16 = 0x1A00 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1519                        Ok(instr.to_le_bytes().to_vec())
1520                    } else {
1521                        // Use 32-bit SUB.W for high registers
1522                        self.encode_thumb32_sub_reg_raw(
1523                            rd_bits as u32,
1524                            rn_bits as u32,
1525                            rm_bits as u32,
1526                        )
1527                    }
1528                } else if let Operand2::Imm(imm) = op2 {
1529                    if *imm <= 7 && rd_bits < 8 && rn_bits < 8 {
1530                        // SUBS Rd, Rn, #imm3 (16-bit): 0001 111 imm3 Rn Rd
1531                        let instr: u16 = 0x1E00 | ((*imm as u16) << 6) | (rn_bits << 3) | rd_bits;
1532                        Ok(instr.to_le_bytes().to_vec())
1533                    } else {
1534                        self.encode_thumb32_sub(rd, rn, *imm as u32)
1535                    }
1536                } else {
1537                    self.encode_thumb32_sub(rd, rn, 0)
1538                }
1539            }
1540
1541            ArmOp::Mov { rd, op2 } => {
1542                let rd_bits = reg_to_bits(rd) as u16;
1543
1544                if let Operand2::Imm(imm) = op2 {
1545                    if *imm <= 255 && rd_bits < 8 {
1546                        // MOVS Rd, #imm8 (16-bit): 0010 0 Rd imm8
1547                        let imm_bits = (*imm as u16) & 0xFF;
1548                        let instr: u16 = 0x2000 | (rd_bits << 8) | imm_bits;
1549                        Ok(instr.to_le_bytes().to_vec())
1550                    } else {
1551                        // Use 32-bit MOVW for larger immediates
1552                        self.encode_thumb32_movw(rd, *imm as u32)
1553                    }
1554                } else if let Operand2::Reg(rm) = op2 {
1555                    let rm_bits = reg_to_bits(rm) as u16;
1556                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
1557                    // D = Rd[3], Rd[2:0] in lower bits
1558                    let d_bit = (rd_bits >> 3) & 1;
1559                    let instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
1560                    Ok(instr.to_le_bytes().to_vec())
1561                } else {
1562                    let instr: u16 = 0xBF00; // NOP fallback
1563                    Ok(instr.to_le_bytes().to_vec())
1564                }
1565            }
1566
1567            ArmOp::Push { regs } => {
1568                // Thumb-2 PUSH encoding:
1569                // If all regs in R0-R7 + LR, use 16-bit: 1011 010 M rrrrrrrr
1570                // Otherwise use 32-bit: STMDB SP!, {regs} = 1110 1001 0010 1101 | 0M0 reglist(13)
1571                let mut reg_list: u16 = 0;
1572                let mut need_32bit = false;
1573                for r in regs {
1574                    let bit = reg_to_bits(r);
1575                    if bit >= 8 && *r != Reg::LR {
1576                        need_32bit = true;
1577                    }
1578                    reg_list |= 1 << bit;
1579                }
1580                if !need_32bit {
1581                    // 16-bit PUSH: 1011 010 M rrrrrrrr
1582                    let m_bit = if reg_list & (1 << 14) != 0 {
1583                        1u16
1584                    } else {
1585                        0u16
1586                    };
1587                    let low_regs = reg_list & 0xFF;
1588                    let instr: u16 = 0xB400 | (m_bit << 8) | low_regs;
1589                    Ok(instr.to_le_bytes().to_vec())
1590                } else {
1591                    // 32-bit STMDB SP!, {regs}: E92D | reglist(16)
1592                    let hw1: u16 = 0xE92D;
1593                    let hw2: u16 = reg_list;
1594                    let mut bytes = hw1.to_le_bytes().to_vec();
1595                    bytes.extend_from_slice(&hw2.to_le_bytes());
1596                    Ok(bytes)
1597                }
1598            }
1599
1600            ArmOp::Pop { regs } => {
1601                // Thumb-2 POP encoding:
1602                // If all regs in R0-R7 + PC, use 16-bit: 1011 110 P rrrrrrrr
1603                // Otherwise use 32-bit: LDMIA SP!, {regs} = 1110 1000 1011 1101 | PM0 reglist(13)
1604                let mut reg_list: u16 = 0;
1605                let mut need_32bit = false;
1606                for r in regs {
1607                    let bit = reg_to_bits(r);
1608                    if bit >= 8 && *r != Reg::PC {
1609                        need_32bit = true;
1610                    }
1611                    reg_list |= 1 << bit;
1612                }
1613                if !need_32bit {
1614                    // 16-bit POP: 1011 110 P rrrrrrrr
1615                    let p_bit = if reg_list & (1 << 15) != 0 {
1616                        1u16
1617                    } else {
1618                        0u16
1619                    };
1620                    let low_regs = reg_list & 0xFF;
1621                    let instr: u16 = 0xBC00 | (p_bit << 8) | low_regs;
1622                    Ok(instr.to_le_bytes().to_vec())
1623                } else {
1624                    // 32-bit LDMIA SP!, {regs}: E8BD | reglist(16)
1625                    let hw1: u16 = 0xE8BD;
1626                    let hw2: u16 = reg_list;
1627                    let mut bytes = hw1.to_le_bytes().to_vec();
1628                    bytes.extend_from_slice(&hw2.to_le_bytes());
1629                    Ok(bytes)
1630                }
1631            }
1632
1633            ArmOp::Nop => {
1634                let instr: u16 = 0xBF00; // NOP in Thumb-2
1635                Ok(instr.to_le_bytes().to_vec())
1636            }
1637
1638            ArmOp::Udf { imm } => {
1639                // UDF (Undefined) in Thumb-2: 16-bit encoding is 0xDE00 | imm8
1640                // This triggers UsageFault/HardFault, used for WASM traps
1641                let instr: u16 = 0xDE00 | (*imm as u16);
1642                let bytes = instr.to_le_bytes().to_vec();
1643                encoding_contracts::verify_thumb16(&bytes);
1644                Ok(bytes)
1645            }
1646
1647            // i64 support: ADDS, ADC, SUBS, SBC for register pair arithmetic
1648            // ADDS sets flags (carry), ADC uses carry from previous ADDS
1649            ArmOp::Adds { rd, rn, op2 } => {
1650                let rd_bits = reg_to_bits(rd) as u16;
1651                let rn_bits = reg_to_bits(rn) as u16;
1652
1653                if let Operand2::Reg(rm) = op2 {
1654                    let rm_bits = reg_to_bits(rm) as u16;
1655                    // 16-bit ADDS is R0-R7 only; i64 pair allocation can place
1656                    // operands in R8-R11, which would overflow the 3-bit fields
1657                    // and corrupt the operands (#178/#180 class). Guard and fall
1658                    // back to 32-bit ADDS.W for high registers.
1659                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1660                        // ADDS Rd, Rn, Rm (16-bit): 0001 100 Rm Rn Rd
1661                        let instr: u16 = 0x1800 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1662                        Ok(instr.to_le_bytes().to_vec())
1663                    } else {
1664                        self.encode_thumb32_adds_reg_raw(
1665                            rd_bits as u32,
1666                            rn_bits as u32,
1667                            rm_bits as u32,
1668                        )
1669                    }
1670                } else {
1671                    // 32-bit Thumb-2 ADDS with immediate
1672                    self.encode_thumb32_adds(rd, rn, 0)
1673                }
1674            }
1675
1676            // ADC: Add with Carry (Thumb-2 32-bit)
1677            // ADC.W Rd, Rn, Rm: EB40 Rn | 00 Rd 00 Rm
1678            ArmOp::Adc { rd, rn, op2 } => {
1679                let rd_bits = reg_to_bits(rd);
1680                let rn_bits = reg_to_bits(rn);
1681
1682                if let Operand2::Reg(rm) = op2 {
1683                    let rm_bits = reg_to_bits(rm);
1684                    // ADC.W Rd, Rn, Rm (T2): 1110 1011 0100 Rn | 0 000 Rd 00 00 Rm
1685                    let hw1: u16 = (0xEB40 | rn_bits) as u16;
1686                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1687
1688                    let mut bytes = hw1.to_le_bytes().to_vec();
1689                    bytes.extend_from_slice(&hw2.to_le_bytes());
1690                    Ok(bytes)
1691                } else {
1692                    // ADC with immediate - use 32-bit encoding
1693                    let hw1: u16 = (0xF140 | rn_bits) as u16;
1694                    let hw2: u16 = (rd_bits << 8) as u16;
1695                    let mut bytes = hw1.to_le_bytes().to_vec();
1696                    bytes.extend_from_slice(&hw2.to_le_bytes());
1697                    Ok(bytes)
1698                }
1699            }
1700
1701            // SUBS sets flags (borrow), SBC uses borrow from previous SUBS
1702            ArmOp::Subs { rd, rn, op2 } => {
1703                let rd_bits = reg_to_bits(rd) as u16;
1704                let rn_bits = reg_to_bits(rn) as u16;
1705
1706                if let Operand2::Reg(rm) = op2 {
1707                    let rm_bits = reg_to_bits(rm) as u16;
1708                    // 16-bit SUBS is R0-R7 only; high-register i64 pair operands
1709                    // would overflow the 3-bit fields (#178/#180 class). Guard
1710                    // and fall back to 32-bit SUBS.W for high registers.
1711                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1712                        // SUBS Rd, Rn, Rm (16-bit): 0001 101 Rm Rn Rd
1713                        let instr: u16 = 0x1A00 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1714                        Ok(instr.to_le_bytes().to_vec())
1715                    } else {
1716                        self.encode_thumb32_subs_reg_raw(
1717                            rd_bits as u32,
1718                            rn_bits as u32,
1719                            rm_bits as u32,
1720                        )
1721                    }
1722                } else {
1723                    // 32-bit Thumb-2 SUBS with immediate
1724                    self.encode_thumb32_subs(rd, rn, 0)
1725                }
1726            }
1727
1728            // SBC: Subtract with Carry (Thumb-2 32-bit)
1729            // SBC.W Rd, Rn, Rm: EB60 Rn | 00 Rd 00 Rm
1730            ArmOp::Sbc { rd, rn, op2 } => {
1731                let rd_bits = reg_to_bits(rd);
1732                let rn_bits = reg_to_bits(rn);
1733
1734                if let Operand2::Reg(rm) = op2 {
1735                    let rm_bits = reg_to_bits(rm);
1736                    // SBC.W Rd, Rn, Rm (T2): 1110 1011 0110 Rn | 0 000 Rd 00 00 Rm
1737                    let hw1: u16 = (0xEB60 | rn_bits) as u16;
1738                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1739
1740                    let mut bytes = hw1.to_le_bytes().to_vec();
1741                    bytes.extend_from_slice(&hw2.to_le_bytes());
1742                    Ok(bytes)
1743                } else {
1744                    // SBC with immediate - use 32-bit encoding
1745                    let hw1: u16 = (0xF160 | rn_bits) as u16;
1746                    let hw2: u16 = (rd_bits << 8) as u16;
1747                    let mut bytes = hw1.to_le_bytes().to_vec();
1748                    bytes.extend_from_slice(&hw2.to_le_bytes());
1749                    Ok(bytes)
1750                }
1751            }
1752
1753            // === 32-bit Thumb-2 encodings ===
1754
1755            // SDIV: 11111011 1001 Rn 1111 Rd 1111 Rm
1756            ArmOp::Sdiv { rd, rn, rm } => {
1757                let rd_bits = reg_to_bits(rd);
1758                let rn_bits = reg_to_bits(rn);
1759                let rm_bits = reg_to_bits(rm);
1760                reg_bits_checked(rd_bits)?;
1761                reg_bits_checked(rn_bits)?;
1762                reg_bits_checked(rm_bits)?;
1763
1764                // Thumb-2 SDIV: FB90 F0F0 | Rn<<16 | Rd<<8 | Rm
1765                // First halfword: 1111 1011 1001 Rn = 0xFB90 | Rn
1766                // Second halfword: 1111 Rd 1111 Rm = 0xF0F0 | Rd<<8 | Rm
1767                let hw1: u16 = (0xFB90 | rn_bits) as u16;
1768                let hw2: u16 = (0xF0F0 | (rd_bits << 8) | rm_bits) as u16;
1769
1770                // Thumb-2 32-bit instructions: first halfword, then second halfword (little-endian each)
1771                let mut bytes = hw1.to_le_bytes().to_vec();
1772                bytes.extend_from_slice(&hw2.to_le_bytes());
1773                encoding_contracts::verify_thumb32(&bytes);
1774                Ok(bytes)
1775            }
1776
1777            // UDIV: 11111011 1011 Rn 1111 Rd 1111 Rm
1778            ArmOp::Udiv { rd, rn, rm } => {
1779                let rd_bits = reg_to_bits(rd);
1780                let rn_bits = reg_to_bits(rn);
1781                let rm_bits = reg_to_bits(rm);
1782                reg_bits_checked(rd_bits)?;
1783                reg_bits_checked(rn_bits)?;
1784                reg_bits_checked(rm_bits)?;
1785
1786                // Thumb-2 UDIV: FBB0 F0F0 | Rn<<16 | Rd<<8 | Rm
1787                let hw1: u16 = (0xFBB0 | rn_bits) as u16;
1788                let hw2: u16 = (0xF0F0 | (rd_bits << 8) | rm_bits) as u16;
1789
1790                let mut bytes = hw1.to_le_bytes().to_vec();
1791                bytes.extend_from_slice(&hw2.to_le_bytes());
1792                encoding_contracts::verify_thumb32(&bytes);
1793                Ok(bytes)
1794            }
1795
1796            // MUL (Thumb-2 32-bit): MUL Rd, Rn, Rm
1797            ArmOp::Mul { rd, rn, rm } => {
1798                let rd_bits = reg_to_bits(rd);
1799                let rn_bits = reg_to_bits(rn);
1800                let rm_bits = reg_to_bits(rm);
1801
1802                // Thumb-2 MUL: FB00 F000 | Rn | Rd<<8 | Rm
1803                // 11111011 0000 Rn | 1111 Rd 0000 Rm
1804                let hw1: u16 = (0xFB00 | rn_bits) as u16;
1805                let hw2: u16 = (0xF000 | (rd_bits << 8) | rm_bits) as u16;
1806
1807                let mut bytes = hw1.to_le_bytes().to_vec();
1808                bytes.extend_from_slice(&hw2.to_le_bytes());
1809                Ok(bytes)
1810            }
1811
1812            // MLS: Rd = Ra - Rn * Rm
1813            ArmOp::Mls { rd, rn, rm, ra } => {
1814                let rd_bits = reg_to_bits(rd);
1815                let rn_bits = reg_to_bits(rn);
1816                let rm_bits = reg_to_bits(rm);
1817                let ra_bits = reg_to_bits(ra);
1818
1819                // Thumb-2 MLS: FB00 Rn | Ra Rd 0001 Rm
1820                // 11111011 0000 Rn | Ra Rd 0001 Rm
1821                let hw1: u16 = (0xFB00 | rn_bits) as u16;
1822                let hw2: u16 = ((ra_bits << 12) | (rd_bits << 8) | 0x10 | rm_bits) as u16;
1823
1824                let mut bytes = hw1.to_le_bytes().to_vec();
1825                bytes.extend_from_slice(&hw2.to_le_bytes());
1826                Ok(bytes)
1827            }
1828
1829            // AND (Thumb-2 32-bit)
1830            ArmOp::And { rd, rn, op2 } => {
1831                if let Operand2::Reg(rm) = op2 {
1832                    let rd_bits = reg_to_bits(rd);
1833                    let rn_bits = reg_to_bits(rn);
1834                    let rm_bits = reg_to_bits(rm);
1835
1836                    // Thumb-2 AND register: EA00 Rn | 0 Rd 00 00 Rm
1837                    let hw1: u16 = (0xEA00 | rn_bits) as u16;
1838                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1839
1840                    let mut bytes = hw1.to_le_bytes().to_vec();
1841                    bytes.extend_from_slice(&hw2.to_le_bytes());
1842                    Ok(bytes)
1843                } else if let Operand2::Imm(imm) = op2 {
1844                    let rd_bits = reg_to_bits(rd);
1845                    let rn_bits = reg_to_bits(rn);
1846                    let imm_val = *imm as u32;
1847
1848                    // Thumb-2 AND.W immediate T1: 11110 i 0 0000 S Rn | 0 imm3 Rd imm8
1849                    let i_bit = (imm_val >> 11) & 1;
1850                    let imm3 = (imm_val >> 8) & 0x7;
1851                    let imm8 = imm_val & 0xFF;
1852
1853                    let hw1: u16 = (0xF000 | (i_bit << 10) | rn_bits) as u16;
1854                    let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
1855
1856                    let mut bytes = hw1.to_le_bytes().to_vec();
1857                    bytes.extend_from_slice(&hw2.to_le_bytes());
1858                    Ok(bytes)
1859                } else {
1860                    // RegShift variant - fallback to NOP
1861                    let instr: u16 = 0xBF00;
1862                    Ok(instr.to_le_bytes().to_vec())
1863                }
1864            }
1865
1866            // ORR (Thumb-2 32-bit)
1867            ArmOp::Orr { rd, rn, op2 } => {
1868                if let Operand2::Reg(rm) = op2 {
1869                    let rd_bits = reg_to_bits(rd);
1870                    let rn_bits = reg_to_bits(rn);
1871                    let rm_bits = reg_to_bits(rm);
1872
1873                    // Thumb-2 ORR: EA40 Rn | 0 Rd 00 00 Rm
1874                    let hw1: u16 = (0xEA40 | rn_bits) as u16;
1875                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1876
1877                    let mut bytes = hw1.to_le_bytes().to_vec();
1878                    bytes.extend_from_slice(&hw2.to_le_bytes());
1879                    Ok(bytes)
1880                } else {
1881                    let instr: u16 = 0xBF00;
1882                    Ok(instr.to_le_bytes().to_vec())
1883                }
1884            }
1885
1886            // EOR (Thumb-2 32-bit)
1887            ArmOp::Eor { rd, rn, op2 } => {
1888                if let Operand2::Reg(rm) = op2 {
1889                    let rd_bits = reg_to_bits(rd);
1890                    let rn_bits = reg_to_bits(rn);
1891                    let rm_bits = reg_to_bits(rm);
1892
1893                    // Thumb-2 EOR: EA80 Rn | 0 Rd 00 00 Rm
1894                    let hw1: u16 = (0xEA80 | rn_bits) as u16;
1895                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1896
1897                    let mut bytes = hw1.to_le_bytes().to_vec();
1898                    bytes.extend_from_slice(&hw2.to_le_bytes());
1899                    Ok(bytes)
1900                } else {
1901                    let instr: u16 = 0xBF00;
1902                    Ok(instr.to_le_bytes().to_vec())
1903                }
1904            }
1905
1906            // Shift operations (16-bit for low registers)
1907            ArmOp::Lsl { rd, rn, shift } => {
1908                let rd_bits = reg_to_bits(rd) as u16;
1909                let rn_bits = reg_to_bits(rn) as u16;
1910                let shift_bits = (*shift as u16) & 0x1F;
1911
1912                if rd_bits < 8 && rn_bits < 8 {
1913                    // LSLS Rd, Rm, #imm5 (16-bit): 0000 0 imm5 Rm Rd
1914                    let instr: u16 = (shift_bits << 6) | (rn_bits << 3) | rd_bits;
1915                    Ok(instr.to_le_bytes().to_vec())
1916                } else {
1917                    // Use 32-bit encoding for high registers
1918                    self.encode_thumb32_shift(rd, rn, *shift, 0b00) // LSL type
1919                }
1920            }
1921
1922            ArmOp::Lsr { rd, rn, shift } => {
1923                let rd_bits = reg_to_bits(rd) as u16;
1924                let rn_bits = reg_to_bits(rn) as u16;
1925                let shift_bits = (*shift as u16) & 0x1F;
1926
1927                if rd_bits < 8 && rn_bits < 8 && shift_bits > 0 {
1928                    // LSRS Rd, Rm, #imm5 (16-bit): 0000 1 imm5 Rm Rd
1929                    let instr: u16 = 0x0800 | (shift_bits << 6) | (rn_bits << 3) | rd_bits;
1930                    Ok(instr.to_le_bytes().to_vec())
1931                } else {
1932                    self.encode_thumb32_shift(rd, rn, *shift, 0b01) // LSR type
1933                }
1934            }
1935
1936            ArmOp::Asr { rd, rn, shift } => {
1937                let rd_bits = reg_to_bits(rd) as u16;
1938                let rn_bits = reg_to_bits(rn) as u16;
1939                let shift_bits = (*shift as u16) & 0x1F;
1940
1941                if rd_bits < 8 && rn_bits < 8 && shift_bits > 0 {
1942                    // ASRS Rd, Rm, #imm5 (16-bit): 0001 0 imm5 Rm Rd
1943                    let instr: u16 = 0x1000 | (shift_bits << 6) | (rn_bits << 3) | rd_bits;
1944                    Ok(instr.to_le_bytes().to_vec())
1945                } else {
1946                    self.encode_thumb32_shift(rd, rn, *shift, 0b10) // ASR type
1947                }
1948            }
1949
1950            ArmOp::Ror { rd, rn, shift } => {
1951                // ROR doesn't have a 16-bit immediate form, use 32-bit
1952                self.encode_thumb32_shift(rd, rn, *shift, 0b11) // ROR type
1953            }
1954
1955            // Register-based shifts (Thumb-2 32-bit)
1956            // Encoding: 11111010 0xxS Rn 1111 Rd 0000 Rm
1957            // xx = shift type: 00=LSL, 01=LSR, 10=ASR, 11=ROR
1958            ArmOp::LslReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b00),
1959            ArmOp::LsrReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b01),
1960            ArmOp::AsrReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b10),
1961            ArmOp::RorReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b11),
1962
1963            // RSB (Reverse Subtract): Rd = imm - Rn
1964            // Thumb-2 T2 encoding: 11110 i 0 1110 S Rn | 0 imm3 Rd imm8
1965            ArmOp::Rsb { rd, rn, imm } => {
1966                let rd_bits = reg_to_bits(rd);
1967                let rn_bits = reg_to_bits(rn);
1968                let imm_val = *imm;
1969
1970                let i_bit = (imm_val >> 11) & 1;
1971                let imm3 = (imm_val >> 8) & 0x7;
1972                let imm8 = imm_val & 0xFF;
1973
1974                // hw1: 11110 i 01110 0 Rn  (S=0)
1975                let hw1: u16 = (0xF1C0 | (i_bit << 10) | rn_bits) as u16;
1976                // hw2: 0 imm3 Rd imm8
1977                let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
1978
1979                let mut bytes = hw1.to_le_bytes().to_vec();
1980                bytes.extend_from_slice(&hw2.to_le_bytes());
1981                Ok(bytes)
1982            }
1983
1984            // CLZ (Thumb-2 32-bit)
1985            ArmOp::Clz { rd, rm } => {
1986                let rd_bits = reg_to_bits(rd);
1987                let rm_bits = reg_to_bits(rm);
1988
1989                // Thumb-2 CLZ: FAB0 Rm | F8 Rd Rm
1990                // 11111010 1011 Rm | 1111 1000 Rd Rm
1991                let hw1: u16 = (0xFAB0 | rm_bits) as u16;
1992                let hw2: u16 = (0xF080 | (rd_bits << 8) | rm_bits) as u16;
1993
1994                let mut bytes = hw1.to_le_bytes().to_vec();
1995                bytes.extend_from_slice(&hw2.to_le_bytes());
1996                Ok(bytes)
1997            }
1998
1999            // RBIT (Thumb-2 32-bit)
2000            ArmOp::Rbit { rd, rm } => {
2001                let rd_bits = reg_to_bits(rd);
2002                let rm_bits = reg_to_bits(rm);
2003
2004                // Thumb-2 RBIT: FA90 Rm | F0 Rd A0 Rm
2005                // 11111010 1001 Rm | 1111 Rd 1010 Rm
2006                let hw1: u16 = (0xFA90 | rm_bits) as u16;
2007                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rm_bits) as u16;
2008
2009                let mut bytes = hw1.to_le_bytes().to_vec();
2010                bytes.extend_from_slice(&hw2.to_le_bytes());
2011                Ok(bytes)
2012            }
2013
2014            // SXTB (16-bit for low registers)
2015            ArmOp::Sxtb { rd, rm } => {
2016                let rd_bits = reg_to_bits(rd) as u16;
2017                let rm_bits = reg_to_bits(rm) as u16;
2018
2019                if rd_bits < 8 && rm_bits < 8 {
2020                    // SXTB Rd, Rm (16-bit): 1011 0010 01 Rm Rd
2021                    let instr: u16 = 0xB240 | (rm_bits << 3) | rd_bits;
2022                    Ok(instr.to_le_bytes().to_vec())
2023                } else {
2024                    // Thumb-2 SXTB.W: FA4F F(rd)80 (rm)
2025                    // 11111010 0100 1111 | 1111 Rd 10 rotate Rm
2026                    let rd_bits32 = rd_bits as u32;
2027                    let rm_bits32 = rm_bits as u32;
2028                    let hw1: u16 = 0xFA4F;
2029                    let hw2: u16 = (0xF080 | (rd_bits32 << 8) | rm_bits32) as u16;
2030                    let mut bytes = hw1.to_le_bytes().to_vec();
2031                    bytes.extend_from_slice(&hw2.to_le_bytes());
2032                    Ok(bytes)
2033                }
2034            }
2035
2036            // SXTH (16-bit for low registers)
2037            ArmOp::Sxth { rd, rm } => {
2038                let rd_bits = reg_to_bits(rd) as u16;
2039                let rm_bits = reg_to_bits(rm) as u16;
2040
2041                if rd_bits < 8 && rm_bits < 8 {
2042                    // SXTH Rd, Rm (16-bit): 1011 0010 00 Rm Rd
2043                    let instr: u16 = 0xB200 | (rm_bits << 3) | rd_bits;
2044                    Ok(instr.to_le_bytes().to_vec())
2045                } else {
2046                    // Thumb-2 SXTH.W: FA0F F(rd)80 (rm)
2047                    // 11111010 0000 1111 | 1111 Rd 10 rotate Rm
2048                    let rd_bits32 = rd_bits as u32;
2049                    let rm_bits32 = rm_bits as u32;
2050                    let hw1: u16 = 0xFA0F;
2051                    let hw2: u16 = (0xF080 | (rd_bits32 << 8) | rm_bits32) as u16;
2052                    let mut bytes = hw1.to_le_bytes().to_vec();
2053                    bytes.extend_from_slice(&hw2.to_le_bytes());
2054                    Ok(bytes)
2055                }
2056            }
2057
2058            // CMP (can be 16-bit for low registers)
2059            ArmOp::Cmp { rn, op2 } => {
2060                let rn_bits = reg_to_bits(rn) as u16;
2061
2062                if let Operand2::Imm(imm) = op2 {
2063                    // Only use 16-bit encoding for non-negative immediates 0-255
2064                    // Negative immediates must use 32-bit encoding
2065                    if *imm >= 0 && *imm <= 255 && rn_bits < 8 {
2066                        // CMP Rn, #imm8 (16-bit): 0010 1 Rn imm8
2067                        let instr: u16 = 0x2800 | (rn_bits << 8) | (*imm as u16 & 0xFF);
2068                        Ok(instr.to_le_bytes().to_vec())
2069                    } else {
2070                        self.encode_thumb32_cmp_imm(rn, *imm as u32)
2071                    }
2072                } else if let Operand2::Reg(rm) = op2 {
2073                    let rm_bits = reg_to_bits(rm) as u16;
2074                    if rn_bits < 8 && rm_bits < 8 {
2075                        // CMP Rn, Rm (16-bit low): 0100 0010 10 Rm Rn
2076                        let instr: u16 = 0x4280 | (rm_bits << 3) | rn_bits;
2077                        Ok(instr.to_le_bytes().to_vec())
2078                    } else {
2079                        // CMP Rn, Rm (16-bit high): 0100 0101 N Rm Rn[2:0]
2080                        let n_bit = (rn_bits >> 3) & 1;
2081                        let instr: u16 = 0x4500 | (n_bit << 7) | (rm_bits << 3) | (rn_bits & 0x7);
2082                        Ok(instr.to_le_bytes().to_vec())
2083                    }
2084                } else {
2085                    let instr: u16 = 0xBF00;
2086                    Ok(instr.to_le_bytes().to_vec())
2087                }
2088            }
2089
2090            // CMN (Compare Negative) - computes Rn + op2 and sets flags
2091            // CMN Rn, #1 sets Z flag if Rn == -1 (since -1 + 1 = 0)
2092            ArmOp::Cmn { rn, op2 } => {
2093                let rn_bits = reg_to_bits(rn) as u16;
2094
2095                if let Operand2::Imm(imm) = op2 {
2096                    // CMN.W Rn, #imm (32-bit encoding)
2097                    // Encoding: F110 Rn | 0F00 imm8 (for small immediates 0-255)
2098                    if *imm >= 0 && *imm <= 255 {
2099                        let imm8 = *imm as u16 & 0xFF;
2100                        let hw1: u16 = 0xF110 | rn_bits;
2101                        let hw2: u16 = 0x0F00 | imm8;
2102                        let mut bytes = hw1.to_le_bytes().to_vec();
2103                        bytes.extend_from_slice(&hw2.to_le_bytes());
2104                        Ok(bytes)
2105                    } else {
2106                        // For other immediates, fallback to NOP (should not happen in our use case)
2107                        Ok(vec![0xBF, 0x00])
2108                    }
2109                } else if let Operand2::Reg(rm) = op2 {
2110                    let rm_bits = reg_to_bits(rm) as u16;
2111                    // 16-bit CMN (T1) only encodes R0-R7; high registers overflow
2112                    // the 3-bit fields and corrupt the operands (#184, the #180
2113                    // class). CMN has no high-register 16-bit form, so fall back
2114                    // to 32-bit CMN.W (T2): EB10 Rn | 0F00 Rm (ADD.W with S=1 and
2115                    // Rd discarded as PC/1111).
2116                    if rn_bits < 8 && rm_bits < 8 {
2117                        // CMN Rn, Rm (16-bit): 0100 0010 11 Rm Rn
2118                        let instr: u16 = 0x42C0 | (rm_bits << 3) | rn_bits;
2119                        Ok(instr.to_le_bytes().to_vec())
2120                    } else {
2121                        let hw1: u16 = 0xEB10 | rn_bits;
2122                        let hw2: u16 = 0x0F00 | rm_bits;
2123                        let mut bytes = hw1.to_le_bytes().to_vec();
2124                        bytes.extend_from_slice(&hw2.to_le_bytes());
2125                        Ok(bytes)
2126                    }
2127                } else {
2128                    Ok(vec![0xBF, 0x00])
2129                }
2130            }
2131
2132            // LDR (can be 16-bit for simple cases)
2133            ArmOp::Ldr { rd, addr } => {
2134                let rd_bits = reg_to_bits(rd);
2135                let base_bits = reg_to_bits(&addr.base);
2136
2137                // Handle register offset mode [base, Roff] or [base, Roff, #imm]
2138                if let Some(offset_reg) = &addr.offset_reg {
2139                    let rm_bits = reg_to_bits(offset_reg);
2140
2141                    // If there's also an immediate offset, we need to ADD it first
2142                    if addr.offset != 0 {
2143                        // Use R12 (IP) as scratch to avoid clobbering the address register
2144                        // ADD R12, Rm, #offset; LDR Rd, [base, R12]
2145                        let scratch = Reg::R12;
2146                        let mut bytes =
2147                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2148                        bytes.extend(self.encode_thumb32_ldr_reg(rd, &addr.base, &scratch)?);
2149                        return Ok(bytes);
2150                    }
2151
2152                    // Simple register offset: LDR Rd, [Rn, Rm]
2153                    // 16-bit: only if Rd, Rn, Rm < R8
2154                    if rd_bits < 8 && base_bits < 8 && rm_bits < 8 {
2155                        // LDR Rd, [Rn, Rm] (16-bit): 0101 100 Rm Rn Rd
2156                        let instr: u16 = 0x5800
2157                            | ((rm_bits as u16) << 6)
2158                            | ((base_bits as u16) << 3)
2159                            | (rd_bits as u16);
2160                        return Ok(instr.to_le_bytes().to_vec());
2161                    }
2162
2163                    // 32-bit register offset
2164                    return self.encode_thumb32_ldr_reg(rd, &addr.base, offset_reg);
2165                }
2166
2167                // Immediate offset mode [base, #imm]
2168                let offset = addr.offset as u32;
2169
2170                if rd_bits < 8 && base_bits < 8 && (offset & 0x3) == 0 && offset <= 124 {
2171                    // LDR Rd, [Rn, #imm5*4] (16-bit): 0110 1 imm5 Rn Rd
2172                    let imm5 = (offset >> 2) as u16;
2173                    let instr: u16 =
2174                        0x6800 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2175                    Ok(instr.to_le_bytes().to_vec())
2176                } else {
2177                    self.encode_thumb32_ldr(rd, &addr.base, offset)
2178                }
2179            }
2180
2181            // STR (can be 16-bit for simple cases)
2182            ArmOp::Str { rd, addr } => {
2183                let rd_bits = reg_to_bits(rd);
2184                let base_bits = reg_to_bits(&addr.base);
2185
2186                // Handle register offset mode [base, Roff] or [base, Roff, #imm]
2187                if let Some(offset_reg) = &addr.offset_reg {
2188                    let rm_bits = reg_to_bits(offset_reg);
2189
2190                    // If there's also an immediate offset, we need to ADD it first
2191                    if addr.offset != 0 {
2192                        // Use R12 (IP) as scratch to avoid clobbering the address register
2193                        // ADD R12, Rm, #offset; STR Rd, [base, R12]
2194                        let scratch = Reg::R12;
2195                        let mut bytes =
2196                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2197                        bytes.extend(self.encode_thumb32_str_reg(rd, &addr.base, &scratch)?);
2198                        return Ok(bytes);
2199                    }
2200
2201                    // Simple register offset: STR Rd, [Rn, Rm]
2202                    // 16-bit: only if Rd, Rn, Rm < R8
2203                    if rd_bits < 8 && base_bits < 8 && rm_bits < 8 {
2204                        // STR Rd, [Rn, Rm] (16-bit): 0101 000 Rm Rn Rd
2205                        let instr: u16 = 0x5000
2206                            | ((rm_bits as u16) << 6)
2207                            | ((base_bits as u16) << 3)
2208                            | (rd_bits as u16);
2209                        return Ok(instr.to_le_bytes().to_vec());
2210                    }
2211
2212                    // 32-bit register offset
2213                    return self.encode_thumb32_str_reg(rd, &addr.base, offset_reg);
2214                }
2215
2216                // Immediate offset mode [base, #imm]
2217                let offset = addr.offset as u32;
2218
2219                if rd_bits < 8 && base_bits < 8 && (offset & 0x3) == 0 && offset <= 124 {
2220                    // STR Rd, [Rn, #imm5*4] (16-bit): 0110 0 imm5 Rn Rd
2221                    let imm5 = (offset >> 2) as u16;
2222                    let instr: u16 =
2223                        0x6000 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2224                    Ok(instr.to_le_bytes().to_vec())
2225                } else {
2226                    self.encode_thumb32_str(rd, &addr.base, offset)
2227                }
2228            }
2229
2230            // LDRB (Thumb-2)
2231            ArmOp::Ldrb { rd, addr } => {
2232                let rd_bits = reg_to_bits(rd);
2233                let base_bits = reg_to_bits(&addr.base);
2234
2235                if let Some(offset_reg) = &addr.offset_reg {
2236                    if addr.offset != 0 {
2237                        let scratch = Reg::R12;
2238                        let mut bytes =
2239                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2240                        bytes.extend(self.encode_thumb32_ldrb_reg(rd, &addr.base, &scratch)?);
2241                        return Ok(bytes);
2242                    }
2243                    return self.encode_thumb32_ldrb_reg(rd, &addr.base, offset_reg);
2244                }
2245
2246                let offset = addr.offset as u32;
2247                if rd_bits < 8 && base_bits < 8 && offset <= 31 {
2248                    // LDRB Rd, [Rn, #imm5] (16-bit): 0111 1 imm5 Rn Rd
2249                    let instr: u16 = 0x7800
2250                        | ((offset as u16) << 6)
2251                        | ((base_bits as u16) << 3)
2252                        | (rd_bits as u16);
2253                    Ok(instr.to_le_bytes().to_vec())
2254                } else {
2255                    self.encode_thumb32_ldrb_imm(rd, &addr.base, offset)
2256                }
2257            }
2258
2259            // LDRSB (Thumb-2)
2260            ArmOp::Ldrsb { rd, addr } => {
2261                let rd_bits = reg_to_bits(rd);
2262                let base_bits = reg_to_bits(&addr.base);
2263
2264                if let Some(offset_reg) = &addr.offset_reg {
2265                    if addr.offset != 0 {
2266                        let scratch = Reg::R12;
2267                        let mut bytes =
2268                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2269                        bytes.extend(self.encode_thumb32_ldrsb_reg(rd, &addr.base, &scratch)?);
2270                        return Ok(bytes);
2271                    }
2272                    return self.encode_thumb32_ldrsb_reg(rd, &addr.base, offset_reg);
2273                }
2274
2275                let offset = addr.offset as u32;
2276                // LDRSB has no 16-bit immediate form (only register)
2277                // For 16-bit reg form: only if Rd, Rn, Rm < R8
2278                if rd_bits < 8 && base_bits < 8 && offset == 0 {
2279                    // No immediate 16-bit encoding for LDRSB; use 32-bit
2280                    self.encode_thumb32_ldrsb_imm(rd, &addr.base, offset)
2281                } else {
2282                    self.encode_thumb32_ldrsb_imm(rd, &addr.base, offset)
2283                }
2284            }
2285
2286            // LDRH (Thumb-2)
2287            ArmOp::Ldrh { rd, addr } => {
2288                let rd_bits = reg_to_bits(rd);
2289                let base_bits = reg_to_bits(&addr.base);
2290
2291                if let Some(offset_reg) = &addr.offset_reg {
2292                    if addr.offset != 0 {
2293                        let scratch = Reg::R12;
2294                        let mut bytes =
2295                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2296                        bytes.extend(self.encode_thumb32_ldrh_reg(rd, &addr.base, &scratch)?);
2297                        return Ok(bytes);
2298                    }
2299                    return self.encode_thumb32_ldrh_reg(rd, &addr.base, offset_reg);
2300                }
2301
2302                let offset = addr.offset as u32;
2303                if rd_bits < 8 && base_bits < 8 && (offset & 0x1) == 0 && offset <= 62 {
2304                    // LDRH Rd, [Rn, #imm5*2] (16-bit): 1000 1 imm5 Rn Rd
2305                    let imm5 = (offset >> 1) as u16;
2306                    let instr: u16 =
2307                        0x8800 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2308                    Ok(instr.to_le_bytes().to_vec())
2309                } else {
2310                    self.encode_thumb32_ldrh_imm(rd, &addr.base, offset)
2311                }
2312            }
2313
2314            // LDRSH (Thumb-2)
2315            ArmOp::Ldrsh { rd, addr } => {
2316                if let Some(offset_reg) = &addr.offset_reg {
2317                    if addr.offset != 0 {
2318                        let scratch = Reg::R12;
2319                        let mut bytes =
2320                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2321                        bytes.extend(self.encode_thumb32_ldrsh_reg(rd, &addr.base, &scratch)?);
2322                        return Ok(bytes);
2323                    }
2324                    return self.encode_thumb32_ldrsh_reg(rd, &addr.base, offset_reg);
2325                }
2326
2327                let offset = addr.offset as u32;
2328                self.encode_thumb32_ldrsh_imm(rd, &addr.base, offset)
2329            }
2330
2331            // STRB (Thumb-2)
2332            ArmOp::Strb { rd, addr } => {
2333                let rd_bits = reg_to_bits(rd);
2334                let base_bits = reg_to_bits(&addr.base);
2335
2336                if let Some(offset_reg) = &addr.offset_reg {
2337                    if addr.offset != 0 {
2338                        let scratch = Reg::R12;
2339                        let mut bytes =
2340                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2341                        bytes.extend(self.encode_thumb32_strb_reg(rd, &addr.base, &scratch)?);
2342                        return Ok(bytes);
2343                    }
2344                    return self.encode_thumb32_strb_reg(rd, &addr.base, offset_reg);
2345                }
2346
2347                let offset = addr.offset as u32;
2348                if rd_bits < 8 && base_bits < 8 && offset <= 31 {
2349                    // STRB Rd, [Rn, #imm5] (16-bit): 0111 0 imm5 Rn Rd
2350                    let instr: u16 = 0x7000
2351                        | ((offset as u16) << 6)
2352                        | ((base_bits as u16) << 3)
2353                        | (rd_bits as u16);
2354                    Ok(instr.to_le_bytes().to_vec())
2355                } else {
2356                    self.encode_thumb32_strb_imm(rd, &addr.base, offset)
2357                }
2358            }
2359
2360            // STRH (Thumb-2)
2361            ArmOp::Strh { rd, addr } => {
2362                let rd_bits = reg_to_bits(rd);
2363                let base_bits = reg_to_bits(&addr.base);
2364
2365                if let Some(offset_reg) = &addr.offset_reg {
2366                    if addr.offset != 0 {
2367                        let scratch = Reg::R12;
2368                        let mut bytes =
2369                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2370                        bytes.extend(self.encode_thumb32_strh_reg(rd, &addr.base, &scratch)?);
2371                        return Ok(bytes);
2372                    }
2373                    return self.encode_thumb32_strh_reg(rd, &addr.base, offset_reg);
2374                }
2375
2376                let offset = addr.offset as u32;
2377                if rd_bits < 8 && base_bits < 8 && (offset & 0x1) == 0 && offset <= 62 {
2378                    // STRH Rd, [Rn, #imm5*2] (16-bit): 1000 0 imm5 Rn Rd
2379                    let imm5 = (offset >> 1) as u16;
2380                    let instr: u16 =
2381                        0x8000 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2382                    Ok(instr.to_le_bytes().to_vec())
2383                } else {
2384                    self.encode_thumb32_strh_imm(rd, &addr.base, offset)
2385                }
2386            }
2387
2388            // MemorySize (Thumb-2)
2389            ArmOp::MemorySize { rd } => {
2390                // LSR rd, R10, #16 — memory size in bytes / 65536 = pages
2391                // Thumb-2 16-bit: LSRS Rd, Rm, #imm5 — 0000 1 imm5 Rm Rd
2392                let rd_bits = reg_to_bits(rd);
2393                let r10_bits = reg_to_bits(&Reg::R10);
2394                if rd_bits < 8 && r10_bits < 8 {
2395                    let instr: u16 =
2396                        0x0800 | (16u16 << 6) | ((r10_bits as u16) << 3) | (rd_bits as u16);
2397                    Ok(instr.to_le_bytes().to_vec())
2398                } else {
2399                    // Thumb-2 32-bit LSR: 1110 1010 010 0 1111 | 0 imm3 Rd imm2 01 Rm
2400                    let imm5: u32 = 16;
2401                    let imm3 = (imm5 >> 2) & 0x7;
2402                    let imm2 = imm5 & 0x3;
2403                    let hw1: u16 = 0xEA4F;
2404                    let hw2: u16 =
2405                        ((imm3 << 12) | (rd_bits << 8) | (imm2 << 6) | 0x10 | r10_bits) as u16;
2406                    let mut bytes = hw1.to_le_bytes().to_vec();
2407                    bytes.extend_from_slice(&hw2.to_le_bytes());
2408                    Ok(bytes)
2409                }
2410            }
2411
2412            // MemoryGrow (Thumb-2)
2413            ArmOp::MemoryGrow { rd, .. } => {
2414                // On embedded with fixed memory, always return -1 (failure)
2415                // MVN rd, #0 → MOV rd, #-1
2416                // Thumb-2 32-bit: MVN: 1111 0 i 0 0 0 1 1 0 1111 | 0 imm3 Rd imm8
2417                let rd_bits = reg_to_bits(rd);
2418                let hw1: u16 = 0xF06F; // MVN with i=0
2419                let hw2: u16 = (rd_bits << 8) as u16; // imm8=0 → ~0 = 0xFFFFFFFF = -1
2420                let mut bytes = hw1.to_le_bytes().to_vec();
2421                bytes.extend_from_slice(&hw2.to_le_bytes());
2422                Ok(bytes)
2423            }
2424
2425            // BX (16-bit)
2426            ArmOp::Bx { rm } => {
2427                let rm_bits = reg_to_bits(rm) as u16;
2428                // BX Rm (16-bit): 0100 0111 0 Rm 000
2429                let instr: u16 = 0x4700 | (rm_bits << 3);
2430                Ok(instr.to_le_bytes().to_vec())
2431            }
2432
2433            // BLX (16-bit) - Branch with Link and Exchange
2434            // BLX Rm: 0100 0111 1 Rm 000
2435            ArmOp::Blx { rm } => {
2436                let rm_bits = reg_to_bits(rm) as u16;
2437                let instr: u16 = 0x4780 | (rm_bits << 3);
2438                Ok(instr.to_le_bytes().to_vec())
2439            }
2440
2441            // CallIndirect - indirect function call via table lookup
2442            // table_index_reg contains the table index
2443            // Generates: LSL R12, idx, #2; LDR R12, [R12, table_base]; BLX R12
2444            ArmOp::CallIndirect {
2445                rd: _,
2446                type_idx: _,
2447                table_index_reg,
2448            } => {
2449                let idx_reg = reg_to_bits(table_index_reg);
2450                let mut bytes = Vec::new();
2451
2452                // For now, we generate code that:
2453                // 1. Multiplies index by 4 (function pointer size)
2454                // 2. Loads function pointer from table (assumes table base in R11)
2455                // 3. Calls the function via BLX
2456                //
2457                // Table base setup must be done by caller/runtime.
2458                // This is a simplified implementation - full support needs:
2459                // - Table base address resolution
2460                // - Type signature checking
2461                // - Bounds checking
2462
2463                // LSL R12, idx_reg, #2 (multiply index by 4)
2464                // Thumb-2 MOV with shift: 11101010 010 S 1111 | 0 imm3 Rd imm2 type Rm
2465                // LSL: type=00, imm5=2 -> imm3=0, imm2=10
2466                let hw1: u16 = 0xEA4F_u16; // MOV.W R12, Rm, LSL #2
2467                let hw2: u16 = ((0x0C00 | (0b10 << 4)) | idx_reg) as u16;
2468                bytes.extend_from_slice(&hw1.to_le_bytes());
2469                bytes.extend_from_slice(&hw2.to_le_bytes());
2470
2471                // LDR R12, [R11, R12] - load function pointer
2472                // Thumb-2 LDR (register): 1111 1000 0101 Rn | Rt 0000 00 imm2 Rm
2473                // Rn=R11, Rt=R12, Rm=R12, imm2=00 (no shift)
2474                let ldr_hw1: u16 = 0xF85B; // LDR.W Rt, [R11, Rm]
2475                let ldr_hw2: u16 = 0xC00C; // Rt=R12, imm2=00, Rm=R12
2476                bytes.extend_from_slice(&ldr_hw1.to_le_bytes());
2477                bytes.extend_from_slice(&ldr_hw2.to_le_bytes());
2478
2479                // BLX R12 (call function indirectly)
2480                // BLX Rm (16-bit): 0100 0111 1 Rm 000
2481                let blx: u16 = 0x47E0; // BLX R12
2482                bytes.extend_from_slice(&blx.to_le_bytes());
2483
2484                Ok(bytes)
2485            }
2486
2487            // Label pseudo-instruction: emits no machine code
2488            ArmOp::Label { .. } => Ok(Vec::new()),
2489
2490            // Conditional branch to label (generic) - offset 0, will be patched
2491            ArmOp::Bcc { cond, label: _ } => {
2492                use synth_synthesis::Condition;
2493                let cond_bits: u16 = match cond {
2494                    Condition::EQ => 0x0,
2495                    Condition::NE => 0x1,
2496                    Condition::HS => 0x2,
2497                    Condition::LO => 0x3,
2498                    Condition::HI => 0x8,
2499                    Condition::LS => 0x9,
2500                    Condition::GE => 0xA,
2501                    Condition::LT => 0xB,
2502                    Condition::GT => 0xC,
2503                    Condition::LE => 0xD,
2504                };
2505                // 16-bit B<cond> with offset 0: 1101 cond imm8
2506                let instr: u16 = 0xD000 | (cond_bits << 8);
2507                Ok(instr.to_le_bytes().to_vec())
2508            }
2509
2510            // Branch instructions
2511            ArmOp::B { label: _ } => {
2512                // Simplified: B.N with offset 0
2513                // For real usage, would need label resolution
2514                let instr: u16 = 0xE000; // B.N #0
2515                Ok(instr.to_le_bytes().to_vec())
2516            }
2517
2518            // BHS (Branch if Higher or Same) - used for bounds checking
2519            // Condition code: 0x2 (C set)
2520            ArmOp::Bhs { label: _ } => {
2521                // 16-bit B<cond> with offset 0: 1101 cond imm8
2522                // cond = 0x2 (HS)
2523                let instr: u16 = 0xD200; // BHS.N #0
2524                Ok(instr.to_le_bytes().to_vec())
2525            }
2526
2527            // BLO (Branch if Lower) - complementary to BHS
2528            // Condition code: 0x3 (C clear)
2529            ArmOp::Blo { label: _ } => {
2530                // 16-bit B<cond> with offset 0: 1101 cond imm8
2531                // cond = 0x3 (LO)
2532                let instr: u16 = 0xD300; // BLO.N #0
2533                Ok(instr.to_le_bytes().to_vec())
2534            }
2535
2536            // Branch with numeric offset (Thumb-2)
2537            // Thumb-2 B.W instruction: 32-bit with +-16MB range
2538            ArmOp::BOffset { offset } => {
2539                // offset is already the halfword displacement: (target - branch - 4) / 2
2540                // This is the raw encoded value, accounting for variable-length instructions
2541                let halfword_offset = *offset;
2542
2543                // 16-bit B.N encoding: 1110 0 imm11 (11-bit signed halfword offset)
2544                // Range: -1024 to +1022 halfwords
2545                if (-1024..=1022).contains(&halfword_offset) {
2546                    // 16-bit B.N encoding: 1110 0 imm11
2547                    let imm11 = (halfword_offset as u16) & 0x7FF;
2548                    let instr: u16 = 0xE000 | imm11;
2549                    Ok(instr.to_le_bytes().to_vec())
2550                } else {
2551                    // 32-bit B.W encoding for larger offsets
2552                    // First halfword: 1111 0 S imm10
2553                    // Second halfword: 10 J1 0 J2 imm11
2554                    // Total offset = SignExtend(S:I1:I2:imm10:imm11:0)
2555                    // where I1 = NOT(J1 XOR S), I2 = NOT(J2 XOR S)
2556
2557                    // The B.W (T4) encoding packs the signed offset as:
2558                    //   S:I1:I2:imm10:imm11:0  (25-bit signed, halfword-aligned)
2559                    // where J1 = NOT(I1 XOR S), J2 = NOT(I2 XOR S)
2560                    // Input halfword_offset already equals (target - PC - 4) / 2,
2561                    // so the full byte offset = halfword_offset << 1.
2562                    // The encoding fields split that 25-bit signed value (including the
2563                    // implicit trailing zero) as: S | imm10 | imm11
2564                    // with I1 = bit 23 and I2 = bit 22 of the signed offset.
2565                    let signed_offset = halfword_offset << 1; // byte offset
2566                    let s = if signed_offset < 0 { 1u32 } else { 0u32 };
2567                    let uoffset = signed_offset as u32;
2568                    let imm10 = (uoffset >> 12) & 0x3FF; // bits [21:12]
2569                    let imm11 = (uoffset >> 1) & 0x7FF; // bits [11:1]
2570                    let i1 = (uoffset >> 23) & 1; // bit 23
2571                    let i2 = (uoffset >> 22) & 1; // bit 22
2572                    let j1 = (!(i1 ^ s)) & 1; // J1 = NOT(I1 XOR S)
2573                    let j2 = (!(i2 ^ s)) & 1; // J2 = NOT(I2 XOR S)
2574
2575                    let hw1: u16 = (0xF000 | (s << 10) | imm10) as u16;
2576                    let hw2: u16 = (0x9000 | (j1 << 13) | (j2 << 11) | imm11) as u16;
2577
2578                    let mut bytes = hw1.to_le_bytes().to_vec();
2579                    bytes.extend_from_slice(&hw2.to_le_bytes());
2580                    Ok(bytes)
2581                }
2582            }
2583
2584            // Conditional branch with numeric offset (Thumb-2)
2585            ArmOp::BCondOffset { cond, offset } => {
2586                use synth_synthesis::Condition;
2587                let cond_bits: u16 = match cond {
2588                    Condition::EQ => 0x0,
2589                    Condition::NE => 0x1,
2590                    Condition::HS => 0x2,
2591                    Condition::LO => 0x3,
2592                    Condition::HI => 0x8,
2593                    Condition::LS => 0x9,
2594                    Condition::GE => 0xA,
2595                    Condition::LT => 0xB,
2596                    Condition::GT => 0xC,
2597                    Condition::LE => 0xD,
2598                };
2599
2600                // offset is already the halfword displacement: (target - branch - 4) / 2
2601                // This is the raw imm8 value for 16-bit B<cond> encoding
2602                let halfword_offset = *offset;
2603
2604                // 16-bit B<cond> encoding: 1101 cond imm8
2605                // Range: -256 to +254 halfwords (imm8 is sign-extended and shifted left 1)
2606                if (-128..=127).contains(&halfword_offset) {
2607                    let imm8 = (halfword_offset as u16) & 0xFF;
2608                    let instr: u16 = 0xD000 | (cond_bits << 8) | imm8;
2609                    Ok(instr.to_le_bytes().to_vec())
2610                } else {
2611                    // 32-bit B<cond>.W for larger offsets
2612                    // First halfword: 1111 0 S cond imm6
2613                    // Second halfword: 10 J1 0 J2 imm11
2614                    let offset = halfword_offset >> 1;
2615                    let s = if offset < 0 { 1u32 } else { 0u32 };
2616                    let imm6 = ((offset >> 11) as u32) & 0x3F;
2617                    let imm11 = (offset as u32) & 0x7FF;
2618                    let j1 = if s == 1 { 1 } else { 0 };
2619                    let j2 = if s == 1 { 1 } else { 0 };
2620
2621                    let hw1: u16 = (0xF000 | (s << 10) | ((cond_bits as u32) << 6) | imm6) as u16;
2622                    let hw2: u16 = (0x8000 | (j1 << 13) | (j2 << 11) | imm11) as u16;
2623
2624                    let mut bytes = hw1.to_le_bytes().to_vec();
2625                    bytes.extend_from_slice(&hw2.to_le_bytes());
2626                    Ok(bytes)
2627                }
2628            }
2629
2630            ArmOp::Bl { label: _ } => {
2631                // BL is always 32-bit in Thumb-2, encoded here as a relocatable
2632                // placeholder; an R_ARM_THM_CALL relocation patches the target
2633                // (see arm_backend.rs). The placeholder must carry an embedded
2634                // addend of -4 so the relocation nets to exactly the symbol S.
2635                //
2636                // Thumb BL computes `target = (P + 4) + signed_offset`. Under
2637                // R_ARM_THM_CALL the linker resolves using the in-place addend;
2638                // a 0xF800 placeholder (addend 0) lands at S+4 — every call one
2639                // instruction past the callee entry (#174). The correct
2640                // placeholder is what `gas` emits for `bl <extern>`:
2641                //   f7ff fffe  ->  `bl <self>`  (S=1, J1=J2=1, imm = -4 addend),
2642                // i.e. hw1=0xF7FF, hw2=0xFFFE. This nets to S, not S+4.
2643                // (The earlier 0xD000 was worse still — a ~+0x600000 addend,
2644                // the garbage `bl c0000c` and "truncated to fit" of #167.)
2645                let hw1: u16 = 0xF7FF;
2646                let hw2: u16 = 0xFFFE;
2647                let mut bytes = hw1.to_le_bytes().to_vec();
2648                bytes.extend_from_slice(&hw2.to_le_bytes());
2649                Ok(bytes)
2650            }
2651
2652            // MVN
2653            ArmOp::Mvn { rd, op2 } => {
2654                if let Operand2::Reg(rm) = op2 {
2655                    let rd_bits = reg_to_bits(rd) as u16;
2656                    let rm_bits = reg_to_bits(rm) as u16;
2657
2658                    if rd_bits < 8 && rm_bits < 8 {
2659                        // MVNS Rd, Rm (16-bit): 0100 0011 11 Rm Rd
2660                        let instr: u16 = 0x43C0 | (rm_bits << 3) | rd_bits;
2661                        Ok(instr.to_le_bytes().to_vec())
2662                    } else {
2663                        // 32-bit MVN
2664                        let hw1: u16 = 0xEA6F_u16;
2665                        let hw2: u16 = ((reg_to_bits(rd) << 8) | reg_to_bits(rm)) as u16;
2666                        let mut bytes = hw1.to_le_bytes().to_vec();
2667                        bytes.extend_from_slice(&hw2.to_le_bytes());
2668                        Ok(bytes)
2669                    }
2670                } else {
2671                    let instr: u16 = 0xBF00;
2672                    Ok(instr.to_le_bytes().to_vec())
2673                }
2674            }
2675
2676            // MOVW - Move Wide (Thumb-2 32-bit)
2677            ArmOp::Movw { rd, imm16 } => {
2678                self.encode_thumb32_movw_raw(reg_to_bits(rd), *imm16 as u32)
2679            }
2680
2681            // MOVT - Move Top (Thumb-2 32-bit)
2682            ArmOp::Movt { rd, imm16 } => {
2683                self.encode_thumb32_movt_raw(reg_to_bits(rd), *imm16 as u32)
2684            }
2685
2686            // SetCond: Materialize condition flag into register (0 or 1)
2687            // Strategy: ITE <cond>; MOV Rd, #1; MOV Rd, #0
2688            // IMPORTANT: Must use ITE (If-Then-Else) because 16-bit Thumb MOV
2689            // always sets flags (MOVS). We need to evaluate the condition BEFORE
2690            // any MOV instruction clobbers the flags from CMP.
2691            ArmOp::SetCond { rd, cond } => {
2692                let rd_bits = reg_to_bits(rd) as u16;
2693
2694                // Condition code encoding for IT block
2695                use synth_synthesis::Condition;
2696                let cond_bits: u16 = match cond {
2697                    Condition::EQ => 0x0,
2698                    Condition::NE => 0x1,
2699                    Condition::LT => 0xB,
2700                    Condition::LE => 0xD,
2701                    Condition::GT => 0xC,
2702                    Condition::GE => 0xA,
2703                    Condition::LO => 0x3, // CC/LO (unsigned <)
2704                    Condition::LS => 0x9, // LS (unsigned <=)
2705                    Condition::HI => 0x8, // HI (unsigned >)
2706                    Condition::HS => 0x2, // CS/HS (unsigned >=)
2707                };
2708
2709                // ITE <cond>: encodes If-Then-Else block
2710                // The mask field depends on firstcond[0]:
2711                // - If firstcond[0] = 0: mask = 0xC for TE pattern (ITE EQ = BF0C)
2712                // - If firstcond[0] = 1: mask = 0x4 for TE pattern (ITE NE = BF14)
2713                let mask = if (cond_bits & 1) == 0 { 0xC } else { 0x4 };
2714                let ite_instr: u16 = 0xBF00 | (cond_bits << 4) | mask;
2715
2716                // Materialize 0/1 into Rd. The 16-bit MOVS (T1) encodes Rd in a
2717                // 3-bit field (bits[10:8]) — only R0–R7. For a high register
2718                // (R8–R12) `rd_bits << 8` overflows into bit 11 and silently
2719                // turns MOVS into CMP (00100 → 00101), corrupting the result
2720                // (this mis-materialized gale's `has_waiter`, so its `local.set`
2721                // stored a stale register → the binary-sem WAKE dispatch read
2722                // garbage). Use the 32-bit MOV.W (T2) for high registers, which
2723                // has a 4-bit Rd field. MOV.W with S=0 doesn't set flags, which
2724                // is fine inside the ITE (the materialized value is the result;
2725                // the flags are not consumed afterwards).
2726                let mut bytes = ite_instr.to_le_bytes().to_vec();
2727                let push_mov = |bytes: &mut Vec<u8>, imm: u16| {
2728                    if rd_bits <= 7 {
2729                        let m: u16 = 0x2000 | (rd_bits << 8) | imm; // 16-bit MOVS Rd,#imm
2730                        bytes.extend_from_slice(&m.to_le_bytes());
2731                    } else {
2732                        // 32-bit MOV.W Rd, #imm (T2): F04F | (Rd<<8) | imm8
2733                        let hw1: u16 = 0xF04F;
2734                        let hw2: u16 = (rd_bits << 8) | imm;
2735                        bytes.extend_from_slice(&hw1.to_le_bytes());
2736                        bytes.extend_from_slice(&hw2.to_le_bytes());
2737                    }
2738                };
2739                push_mov(&mut bytes, 1); // Then branch (condition true)  → 1
2740                push_mov(&mut bytes, 0); // Else branch (condition false) → 0
2741                Ok(bytes)
2742            }
2743
2744            // I64SetCond: Compare two i64 register pairs, result 0/1 in rd
2745            // EQ/NE: CMP lo,lo; IT EQ; CMPEQ hi,hi; ITE <cond>; MOV 1; MOV 0
2746            // LT: CMP lo,lo; SBCS rd,hi,hi; ITE LT; MOV 1; MOV 0
2747            // GT: CMP lo,lo (swapped); SBCS rd,hi,hi (swapped); ITE LT; MOV 1; MOV 0
2748            ArmOp::I64SetCond {
2749                rd,
2750                rn_lo,
2751                rn_hi,
2752                rm_lo,
2753                rm_hi,
2754                cond,
2755            } => {
2756                use synth_synthesis::Condition;
2757                let rd_bits = reg_to_bits(rd) as u16;
2758                let mut bytes = Vec::new();
2759
2760                // Helper: encode CMP Rn, Rm (16-bit)
2761                let encode_cmp_reg = |rn: &synth_synthesis::Reg,
2762                                      rm: &synth_synthesis::Reg|
2763                 -> Vec<u8> {
2764                    let rn_bits = reg_to_bits(rn) as u16;
2765                    let rm_bits = reg_to_bits(rm) as u16;
2766                    if rn_bits < 8 && rm_bits < 8 {
2767                        let instr: u16 = 0x4280 | (rm_bits << 3) | rn_bits;
2768                        instr.to_le_bytes().to_vec()
2769                    } else {
2770                        let n_bit = (rn_bits >> 3) & 1;
2771                        let instr: u16 = 0x4500 | (n_bit << 7) | (rm_bits << 3) | (rn_bits & 0x7);
2772                        instr.to_le_bytes().to_vec()
2773                    }
2774                };
2775
2776                // Helper: encode ITE <cond> (2 bytes)
2777                let encode_ite = |cond_bits: u16| -> Vec<u8> {
2778                    let mask = if (cond_bits & 1) == 0 { 0xC } else { 0x4 };
2779                    let ite_instr: u16 = 0xBF00 | (cond_bits << 4) | mask;
2780                    ite_instr.to_le_bytes().to_vec()
2781                };
2782
2783                // Helper: encode SetCond (ITE + MOV #1 + MOV #0) for given condition
2784                let encode_setcond = |cond_bits: u16, rd_bits: u16| -> Vec<u8> {
2785                    let mut b = encode_ite(cond_bits);
2786                    let mov_one: u16 = 0x2001 | (rd_bits << 8);
2787                    let mov_zero: u16 = 0x2000 | (rd_bits << 8);
2788                    b.extend_from_slice(&mov_one.to_le_bytes());
2789                    b.extend_from_slice(&mov_zero.to_le_bytes());
2790                    b
2791                };
2792
2793                match cond {
2794                    Condition::EQ | Condition::NE => {
2795                        // CMP rn_lo, rm_lo (compare low words)
2796                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2797
2798                        // IT EQ (execute next instruction only if Z=1)
2799                        let it_eq: u16 = 0xBF08; // IT EQ: cond=0000, mask=1000
2800                        bytes.extend_from_slice(&it_eq.to_le_bytes());
2801
2802                        // CMPEQ rn_hi, rm_hi (compare high words, only if low equal)
2803                        bytes.extend_from_slice(&encode_cmp_reg(rn_hi, rm_hi));
2804
2805                        // ITE <cond>; MOV rd, #1; MOV rd, #0
2806                        let cond_bits: u16 = match cond {
2807                            Condition::EQ => 0x0,
2808                            Condition::NE => 0x1,
2809                            _ => unreachable!(),
2810                        };
2811                        bytes.extend_from_slice(&encode_setcond(cond_bits, rd_bits));
2812                    }
2813
2814                    Condition::LT => {
2815                        // CMP rn_lo, rm_lo (sets C flag for borrow)
2816                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2817
2818                        // SBCS rd, rn_hi, rm_hi (subtract with carry, sets N,V flags)
2819                        // SBCS.W Rd, Rn, Rm: EB70 Rn | 0000 Rd 0000 Rm
2820                        let rn_hi_bits = reg_to_bits(rn_hi);
2821                        let rm_hi_bits = reg_to_bits(rm_hi);
2822                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2823                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2824                        bytes.extend_from_slice(&hw1.to_le_bytes());
2825                        bytes.extend_from_slice(&hw2.to_le_bytes());
2826
2827                        // ITE LT; MOV rd, #1; MOV rd, #0
2828                        bytes.extend_from_slice(&encode_setcond(0xB, rd_bits)); // LT = 0xB
2829                    }
2830
2831                    Condition::GT => {
2832                        // GT(a,b) = LT(b,a): swap operands
2833                        // CMP rm_lo, rn_lo (swapped)
2834                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2835
2836                        // SBCS rd, rm_hi, rn_hi (swapped)
2837                        let rm_hi_bits = reg_to_bits(rm_hi);
2838                        let rn_hi_bits = reg_to_bits(rn_hi);
2839                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2840                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2841                        bytes.extend_from_slice(&hw1.to_le_bytes());
2842                        bytes.extend_from_slice(&hw2.to_le_bytes());
2843
2844                        // ITE LT; MOV rd, #1; MOV rd, #0
2845                        bytes.extend_from_slice(&encode_setcond(0xB, rd_bits)); // LT = 0xB
2846                    }
2847
2848                    Condition::LE => {
2849                        // LE(a,b) = !GT(a,b): use GT logic but invert result
2850                        // GT(a,b) = LT(b,a): so we do CMP(b,a) and check LT, then invert
2851                        // CMP rm_lo, rn_lo (swapped, same as GT)
2852                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2853
2854                        // SBCS rd, rm_hi, rn_hi (swapped)
2855                        let rm_hi_bits = reg_to_bits(rm_hi);
2856                        let rn_hi_bits = reg_to_bits(rn_hi);
2857                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2858                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2859                        bytes.extend_from_slice(&hw1.to_le_bytes());
2860                        bytes.extend_from_slice(&hw2.to_le_bytes());
2861
2862                        // ITE GE; MOV rd, #1; MOV rd, #0 (GE is !LT, so inverting GT result)
2863                        bytes.extend_from_slice(&encode_setcond(0xA, rd_bits)); // GE = 0xA
2864                    }
2865
2866                    Condition::GE => {
2867                        // GE(a,b) = !LT(a,b): use LT logic but invert result
2868                        // CMP rn_lo, rm_lo (same as LT)
2869                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2870
2871                        // SBCS rd, rn_hi, rm_hi (same as LT)
2872                        let rn_hi_bits = reg_to_bits(rn_hi);
2873                        let rm_hi_bits = reg_to_bits(rm_hi);
2874                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2875                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2876                        bytes.extend_from_slice(&hw1.to_le_bytes());
2877                        bytes.extend_from_slice(&hw2.to_le_bytes());
2878
2879                        // ITE GE; MOV rd, #1; MOV rd, #0 (GE is !LT)
2880                        bytes.extend_from_slice(&encode_setcond(0xA, rd_bits)); // GE = 0xA
2881                    }
2882
2883                    // Unsigned comparisons - same instruction sequence, different conditions
2884                    Condition::LO => {
2885                        // LO (unsigned LT): CMP lo, SBCS hi, check C=0
2886                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2887                        let rn_hi_bits = reg_to_bits(rn_hi);
2888                        let rm_hi_bits = reg_to_bits(rm_hi);
2889                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2890                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2891                        bytes.extend_from_slice(&hw1.to_le_bytes());
2892                        bytes.extend_from_slice(&hw2.to_le_bytes());
2893                        bytes.extend_from_slice(&encode_setcond(0x3, rd_bits)); // LO = 0x3 (CC)
2894                    }
2895
2896                    Condition::HI => {
2897                        // HI (unsigned GT): swap operands and check LO
2898                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2899                        let rm_hi_bits = reg_to_bits(rm_hi);
2900                        let rn_hi_bits = reg_to_bits(rn_hi);
2901                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2902                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2903                        bytes.extend_from_slice(&hw1.to_le_bytes());
2904                        bytes.extend_from_slice(&hw2.to_le_bytes());
2905                        bytes.extend_from_slice(&encode_setcond(0x3, rd_bits)); // LO = 0x3 (CC)
2906                    }
2907
2908                    Condition::LS => {
2909                        // LS (unsigned LE): !(a > b) = !(HI), so do HI and invert
2910                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2911                        let rm_hi_bits = reg_to_bits(rm_hi);
2912                        let rn_hi_bits = reg_to_bits(rn_hi);
2913                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2914                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2915                        bytes.extend_from_slice(&hw1.to_le_bytes());
2916                        bytes.extend_from_slice(&hw2.to_le_bytes());
2917                        bytes.extend_from_slice(&encode_setcond(0x2, rd_bits)); // HS = 0x2 (CS) = !LO
2918                    }
2919
2920                    Condition::HS => {
2921                        // HS (unsigned GE): !(a < b) = !(LO)
2922                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2923                        let rn_hi_bits = reg_to_bits(rn_hi);
2924                        let rm_hi_bits = reg_to_bits(rm_hi);
2925                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2926                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2927                        bytes.extend_from_slice(&hw1.to_le_bytes());
2928                        bytes.extend_from_slice(&hw2.to_le_bytes());
2929                        bytes.extend_from_slice(&encode_setcond(0x2, rd_bits)); // HS = 0x2 (CS) = !LO
2930                    }
2931                }
2932
2933                Ok(bytes)
2934            }
2935
2936            // I64SetCondZ: Test if i64 register pair is zero, result 0/1 in rd
2937            // ORR.W rd, rn_lo, rn_hi; CMP rd, #0; ITE EQ; MOV 1; MOV 0
2938            ArmOp::I64SetCondZ { rd, rn_lo, rn_hi } => {
2939                let rd_bits = reg_to_bits(rd);
2940                let rn_lo_bits = reg_to_bits(rn_lo);
2941                let rn_hi_bits = reg_to_bits(rn_hi);
2942                let mut bytes = Vec::new();
2943
2944                // ORR.W rd, rn_lo, rn_hi: EA40 rn_lo | 0000 rd 0000 rn_hi
2945                let hw1: u16 = (0xEA40 | rn_lo_bits) as u16;
2946                let hw2: u16 = ((rd_bits << 8) | rn_hi_bits) as u16;
2947                bytes.extend_from_slice(&hw1.to_le_bytes());
2948                bytes.extend_from_slice(&hw2.to_le_bytes());
2949
2950                // CMP rd, #0 (16-bit): 0010 1 Rd 0000 0000
2951                let cmp_instr: u16 = 0x2800 | ((rd_bits as u16) << 8);
2952                bytes.extend_from_slice(&cmp_instr.to_le_bytes());
2953
2954                // ITE EQ; MOV rd, #1; MOV rd, #0
2955                let mask = 0xC_u16; // ITE EQ mask: firstcond[0]=0, mask=0xC
2956                let ite_instr: u16 = 0xBF00 | mask;
2957                bytes.extend_from_slice(&ite_instr.to_le_bytes());
2958                let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
2959                let mov_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
2960                bytes.extend_from_slice(&mov_one.to_le_bytes());
2961                bytes.extend_from_slice(&mov_zero.to_le_bytes());
2962
2963                Ok(bytes)
2964            }
2965
2966            // I64Mul: 64-bit multiply using UMULL + MLA cross products
2967            // Formula: result = (a_lo * b_lo) + ((a_lo * b_hi + a_hi * b_lo) << 32)
2968            // Uses R12 as scratch register
2969            ArmOp::I64Mul {
2970                rd_lo,
2971                rd_hi,
2972                rn_lo,
2973                rn_hi,
2974                rm_lo,
2975                rm_hi,
2976            } => {
2977                let rd_lo_bits = reg_to_bits(rd_lo);
2978                let rd_hi_bits = reg_to_bits(rd_hi);
2979                let rn_lo_bits = reg_to_bits(rn_lo);
2980                let rn_hi_bits = reg_to_bits(rn_hi);
2981                let rm_lo_bits = reg_to_bits(rm_lo);
2982                let rm_hi_bits = reg_to_bits(rm_hi);
2983                let r12: u32 = 12; // IP scratch register
2984                let mut bytes = Vec::new();
2985
2986                // 1. MUL R12, rn_lo, rm_hi  (R12 = a_lo * b_hi)
2987                // Thumb-2 MUL: hw1=0xFB00|Rn, hw2=0xF000|(Rd<<8)|Rm
2988                let hw1: u16 = (0xFB00 | rn_lo_bits) as u16;
2989                let hw2: u16 = (0xF000 | (r12 << 8) | rm_hi_bits) as u16;
2990                bytes.extend_from_slice(&hw1.to_le_bytes());
2991                bytes.extend_from_slice(&hw2.to_le_bytes());
2992
2993                // 2. MLA R12, rn_hi, rm_lo, R12  (R12 += a_hi * b_lo)
2994                // Thumb-2 MLA: hw1=0xFB00|Rn, hw2=(Ra<<12)|(Rd<<8)|Rm
2995                let hw1: u16 = (0xFB00 | rn_hi_bits) as u16;
2996                let hw2: u16 = ((r12 << 12) | (r12 << 8) | rm_lo_bits) as u16;
2997                bytes.extend_from_slice(&hw1.to_le_bytes());
2998                bytes.extend_from_slice(&hw2.to_le_bytes());
2999
3000                // 3. UMULL rd_lo, rd_hi, rn_lo, rm_lo  (rd_lo:rd_hi = a_lo * b_lo)
3001                // Thumb-2 UMULL: hw1=0xFBA0|Rn, hw2=(RdLo<<12)|(RdHi<<8)|Rm
3002                let hw1: u16 = (0xFBA0 | rn_lo_bits) as u16;
3003                let hw2: u16 = ((rd_lo_bits << 12) | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3004                bytes.extend_from_slice(&hw1.to_le_bytes());
3005                bytes.extend_from_slice(&hw2.to_le_bytes());
3006
3007                // 4. ADD rd_hi, R12  (rd_hi += cross products)
3008                // 16-bit high reg ADD: 01000100 D Rm Rdn[2:0]
3009                let d_bit = (rd_hi_bits >> 3) & 1;
3010                let add_instr: u16 =
3011                    (0x4400 | (d_bit << 7) | (r12 << 3) | (rd_hi_bits & 0x7)) as u16;
3012                bytes.extend_from_slice(&add_instr.to_le_bytes());
3013
3014                Ok(bytes)
3015            }
3016
3017            // I64Shl: 64-bit shift left with branch for n<32 vs n>=32
3018            // rm_hi (R3) is used as temp register
3019            ArmOp::I64Shl {
3020                rd_lo,
3021                rd_hi,
3022                rn_lo,
3023                rn_hi,
3024                rm_lo,
3025                rm_hi,
3026            } => {
3027                let rd_lo_bits = reg_to_bits(rd_lo);
3028                let rd_hi_bits = reg_to_bits(rd_hi);
3029                let rn_lo_bits = reg_to_bits(rn_lo);
3030                let rn_hi_bits = reg_to_bits(rn_hi);
3031                let rm_lo_bits = reg_to_bits(rm_lo);
3032                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3033                let mut bytes = Vec::new();
3034
3035                // AND.W rm_lo, rm_lo, #63  (mask shift amount to 6 bits)
3036                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3037                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3038                bytes.extend_from_slice(&hw1.to_le_bytes());
3039                bytes.extend_from_slice(&hw2.to_le_bytes());
3040
3041                // SUBS.W rm_hi, rm_lo, #32  (rm_hi = n-32, sets flags)
3042                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3043                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3044                bytes.extend_from_slice(&hw1.to_le_bytes());
3045                bytes.extend_from_slice(&hw2.to_le_bytes());
3046
3047                // BPL .large (branch if n >= 32, offset = +10 halfwords)
3048                let bpl: u16 = 0xD50A;
3049                bytes.extend_from_slice(&bpl.to_le_bytes());
3050
3051                // --- Small shift (n < 32) ---
3052                // RSB.W rm_hi, rm_lo, #32  (rm_hi = 32-n)
3053                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3054                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3055                bytes.extend_from_slice(&hw1.to_le_bytes());
3056                bytes.extend_from_slice(&hw2.to_le_bytes());
3057
3058                // LSR.W rm_hi, rn_lo, rm_hi  (rm_hi = lo >> (32-n), overflow bits)
3059                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3060                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3061                bytes.extend_from_slice(&hw1.to_le_bytes());
3062                bytes.extend_from_slice(&hw2.to_le_bytes());
3063
3064                // LSL.W rd_hi, rn_hi, rm_lo  (hi <<= n)
3065                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3066                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3067                bytes.extend_from_slice(&hw1.to_le_bytes());
3068                bytes.extend_from_slice(&hw2.to_le_bytes());
3069
3070                // ORR.W rd_hi, rd_hi, rm_hi  (hi |= overflow bits from lo)
3071                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3072                let hw2: u16 = ((rd_hi_bits << 8) | rm_hi_bits) as u16;
3073                bytes.extend_from_slice(&hw1.to_le_bytes());
3074                bytes.extend_from_slice(&hw2.to_le_bytes());
3075
3076                // LSL.W rd_lo, rn_lo, rm_lo  (lo <<= n)
3077                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3078                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3079                bytes.extend_from_slice(&hw1.to_le_bytes());
3080                bytes.extend_from_slice(&hw2.to_le_bytes());
3081
3082                // B .done (skip large shift: +2 halfwords)
3083                let b_done: u16 = 0xE002;
3084                bytes.extend_from_slice(&b_done.to_le_bytes());
3085
3086                // --- Large shift (n >= 32) ---
3087                // LSL.W rd_hi, rn_lo, rm_hi  (hi = lo << (n-32))
3088                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3089                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_hi_bits) as u16;
3090                bytes.extend_from_slice(&hw1.to_le_bytes());
3091                bytes.extend_from_slice(&hw2.to_le_bytes());
3092
3093                // MOV rd_lo, #0
3094                let mov_zero: u16 = 0x2000 | ((rd_lo_bits as u16) << 8);
3095                bytes.extend_from_slice(&mov_zero.to_le_bytes());
3096
3097                Ok(bytes) // Total: 38 bytes
3098            }
3099
3100            // I64ShrU: 64-bit logical shift right with branch for n<32 vs n>=32
3101            ArmOp::I64ShrU {
3102                rd_lo,
3103                rd_hi,
3104                rn_lo,
3105                rn_hi,
3106                rm_lo,
3107                rm_hi,
3108            } => {
3109                let rd_lo_bits = reg_to_bits(rd_lo);
3110                let rd_hi_bits = reg_to_bits(rd_hi);
3111                let rn_lo_bits = reg_to_bits(rn_lo);
3112                let rn_hi_bits = reg_to_bits(rn_hi);
3113                let rm_lo_bits = reg_to_bits(rm_lo);
3114                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3115                let mut bytes = Vec::new();
3116
3117                // AND.W rm_lo, rm_lo, #63
3118                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3119                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3120                bytes.extend_from_slice(&hw1.to_le_bytes());
3121                bytes.extend_from_slice(&hw2.to_le_bytes());
3122
3123                // SUBS.W rm_hi, rm_lo, #32
3124                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3125                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3126                bytes.extend_from_slice(&hw1.to_le_bytes());
3127                bytes.extend_from_slice(&hw2.to_le_bytes());
3128
3129                // BPL .large (+10 halfwords)
3130                let bpl: u16 = 0xD50A;
3131                bytes.extend_from_slice(&bpl.to_le_bytes());
3132
3133                // --- Small shift (n < 32) ---
3134                // RSB.W rm_hi, rm_lo, #32  (rm_hi = 32-n)
3135                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3136                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3137                bytes.extend_from_slice(&hw1.to_le_bytes());
3138                bytes.extend_from_slice(&hw2.to_le_bytes());
3139
3140                // LSL.W rm_hi, rn_hi, rm_hi  (rm_hi = hi << (32-n), bits flowing to lo)
3141                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3142                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3143                bytes.extend_from_slice(&hw1.to_le_bytes());
3144                bytes.extend_from_slice(&hw2.to_le_bytes());
3145
3146                // LSR.W rd_lo, rn_lo, rm_lo  (lo >>= n)
3147                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3148                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3149                bytes.extend_from_slice(&hw1.to_le_bytes());
3150                bytes.extend_from_slice(&hw2.to_le_bytes());
3151
3152                // ORR.W rd_lo, rd_lo, rm_hi  (lo |= overflow from hi)
3153                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3154                let hw2: u16 = ((rd_lo_bits << 8) | rm_hi_bits) as u16;
3155                bytes.extend_from_slice(&hw1.to_le_bytes());
3156                bytes.extend_from_slice(&hw2.to_le_bytes());
3157
3158                // LSR.W rd_hi, rn_hi, rm_lo  (hi >>= n, logical)
3159                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3160                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3161                bytes.extend_from_slice(&hw1.to_le_bytes());
3162                bytes.extend_from_slice(&hw2.to_le_bytes());
3163
3164                // B .done (+2 halfwords)
3165                let b_done: u16 = 0xE002;
3166                bytes.extend_from_slice(&b_done.to_le_bytes());
3167
3168                // --- Large shift (n >= 32) ---
3169                // LSR.W rd_lo, rn_hi, rm_hi  (lo = hi >> (n-32))
3170                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3171                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_hi_bits) as u16;
3172                bytes.extend_from_slice(&hw1.to_le_bytes());
3173                bytes.extend_from_slice(&hw2.to_le_bytes());
3174
3175                // MOV rd_hi, #0
3176                let mov_zero: u16 = 0x2000 | ((rd_hi_bits as u16) << 8);
3177                bytes.extend_from_slice(&mov_zero.to_le_bytes());
3178
3179                Ok(bytes) // Total: 38 bytes
3180            }
3181
3182            // I64ShrS: 64-bit arithmetic shift right with branch for n<32 vs n>=32
3183            ArmOp::I64ShrS {
3184                rd_lo,
3185                rd_hi,
3186                rn_lo,
3187                rn_hi,
3188                rm_lo,
3189                rm_hi,
3190            } => {
3191                let rd_lo_bits = reg_to_bits(rd_lo);
3192                let rd_hi_bits = reg_to_bits(rd_hi);
3193                let rn_lo_bits = reg_to_bits(rn_lo);
3194                let rn_hi_bits = reg_to_bits(rn_hi);
3195                let rm_lo_bits = reg_to_bits(rm_lo);
3196                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3197                let mut bytes = Vec::new();
3198
3199                // AND.W rm_lo, rm_lo, #63
3200                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3201                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3202                bytes.extend_from_slice(&hw1.to_le_bytes());
3203                bytes.extend_from_slice(&hw2.to_le_bytes());
3204
3205                // SUBS.W rm_hi, rm_lo, #32
3206                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3207                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3208                bytes.extend_from_slice(&hw1.to_le_bytes());
3209                bytes.extend_from_slice(&hw2.to_le_bytes());
3210
3211                // BPL .large (+10 halfwords)
3212                let bpl: u16 = 0xD50A;
3213                bytes.extend_from_slice(&bpl.to_le_bytes());
3214
3215                // --- Small shift (n < 32) ---
3216                // RSB.W rm_hi, rm_lo, #32
3217                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3218                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3219                bytes.extend_from_slice(&hw1.to_le_bytes());
3220                bytes.extend_from_slice(&hw2.to_le_bytes());
3221
3222                // LSL.W rm_hi, rn_hi, rm_hi  (rm_hi = hi << (32-n), bits flowing to lo)
3223                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3224                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3225                bytes.extend_from_slice(&hw1.to_le_bytes());
3226                bytes.extend_from_slice(&hw2.to_le_bytes());
3227
3228                // LSR.W rd_lo, rn_lo, rm_lo  (lo >>= n, logical for lo word)
3229                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3230                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3231                bytes.extend_from_slice(&hw1.to_le_bytes());
3232                bytes.extend_from_slice(&hw2.to_le_bytes());
3233
3234                // ORR.W rd_lo, rd_lo, rm_hi  (lo |= overflow from hi)
3235                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3236                let hw2: u16 = ((rd_lo_bits << 8) | rm_hi_bits) as u16;
3237                bytes.extend_from_slice(&hw1.to_le_bytes());
3238                bytes.extend_from_slice(&hw2.to_le_bytes());
3239
3240                // ASR.W rd_hi, rn_hi, rm_lo  (hi >>= n, arithmetic/sign-extending)
3241                let hw1: u16 = (0xFA40 | rn_hi_bits) as u16;
3242                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3243                bytes.extend_from_slice(&hw1.to_le_bytes());
3244                bytes.extend_from_slice(&hw2.to_le_bytes());
3245
3246                // B .done (+3 halfwords, large shift is 8 bytes)
3247                let b_done: u16 = 0xE003;
3248                bytes.extend_from_slice(&b_done.to_le_bytes());
3249
3250                // --- Large shift (n >= 32) ---
3251                // ASR.W rd_lo, rn_hi, rm_hi  (lo = hi >>> (n-32))
3252                let hw1: u16 = (0xFA40 | rn_hi_bits) as u16;
3253                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_hi_bits) as u16;
3254                bytes.extend_from_slice(&hw1.to_le_bytes());
3255                bytes.extend_from_slice(&hw2.to_le_bytes());
3256
3257                // ASR.W rd_hi, rn_hi, #31  (hi = sign extension, all 0s or all 1s)
3258                // Thumb-2 ASR immediate: hw1=0xEA4F, hw2=imm3:Rd:imm2:10:Rm
3259                // imm5=31=11111 → imm3=111, imm2=11
3260                let hw1: u16 = 0xEA4F;
3261                let hw2: u16 = (0x7000 | (rd_hi_bits << 8) | 0x00E0 | rn_hi_bits) as u16;
3262                bytes.extend_from_slice(&hw1.to_le_bytes());
3263                bytes.extend_from_slice(&hw2.to_le_bytes());
3264
3265                Ok(bytes) // Total: 40 bytes
3266            }
3267
3268            // I64Rotl: 64-bit rotate left
3269            // For n < 32: new_hi = (hi << n) | (lo >> (32-n)), new_lo = (lo << n) | (hi >> (32-n))
3270            // For n >= 32: same formula but with lo/hi conceptually swapped, shift by (n-32)
3271            // Uses R4 (saved/restored) and R12 as scratch
3272            ArmOp::I64Rotl {
3273                rdlo,
3274                rdhi,
3275                rnlo,
3276                rnhi,
3277                shift,
3278            } => {
3279                let rd_lo_bits = reg_to_bits(rdlo);
3280                let rd_hi_bits = reg_to_bits(rdhi);
3281                let rn_lo_bits = reg_to_bits(rnlo);
3282                let rn_hi_bits = reg_to_bits(rnhi);
3283                let shift_bits = reg_to_bits(shift);
3284                let r12: u32 = 12; // IP scratch
3285                let r3: u32 = 3; // Scratch (high word of shift amount, unused)
3286                let r4: u32 = 4; // Scratch (saved/restored)
3287                let mut bytes = Vec::new();
3288
3289                // PUSH {R4}
3290                bytes.extend_from_slice(&0xB410u16.to_le_bytes());
3291
3292                // AND.W shift, shift, #63 (mask to 6 bits)
3293                let hw1: u16 = (0xF000 | shift_bits) as u16;
3294                let hw2: u16 = ((shift_bits << 8) | 0x3F) as u16;
3295                bytes.extend_from_slice(&hw1.to_le_bytes());
3296                bytes.extend_from_slice(&hw2.to_le_bytes());
3297
3298                // SUBS.W R3, shift, #32 (R3 = n-32, sets flags)
3299                let hw1: u16 = (0xF1B0 | shift_bits) as u16;
3300                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3301                bytes.extend_from_slice(&hw1.to_le_bytes());
3302                bytes.extend_from_slice(&hw2.to_le_bytes());
3303
3304                // BPL .large (branch if n >= 32, offset = +14 halfwords)
3305                let bpl: u16 = 0xD50E;
3306                bytes.extend_from_slice(&bpl.to_le_bytes());
3307
3308                // === Small rotation (n < 32) ===
3309                // RSB.W R3, shift, #32 (R3 = 32-n)
3310                let hw1: u16 = (0xF1C0 | shift_bits) as u16;
3311                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3312                bytes.extend_from_slice(&hw1.to_le_bytes());
3313                bytes.extend_from_slice(&hw2.to_le_bytes());
3314
3315                // LSR.W R4, rn_lo, R3 (R4 = lo >> (32-n), will go to new_hi)
3316                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3317                let hw2: u16 = (0xF000 | (r4 << 8) | r3) as u16;
3318                bytes.extend_from_slice(&hw1.to_le_bytes());
3319                bytes.extend_from_slice(&hw2.to_le_bytes());
3320
3321                // LSR.W R12, rn_hi, R3 (R12 = hi >> (32-n), will go to new_lo)
3322                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3323                let hw2: u16 = (0xF000 | (r12 << 8) | r3) as u16;
3324                bytes.extend_from_slice(&hw1.to_le_bytes());
3325                bytes.extend_from_slice(&hw2.to_le_bytes());
3326
3327                // LSL.W rd_hi, rn_hi, shift (rd_hi = hi << n)
3328                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3329                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | shift_bits) as u16;
3330                bytes.extend_from_slice(&hw1.to_le_bytes());
3331                bytes.extend_from_slice(&hw2.to_le_bytes());
3332
3333                // ORR.W rd_hi, rd_hi, R4 (rd_hi = (hi << n) | (lo >> (32-n)))
3334                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3335                let hw2: u16 = ((rd_hi_bits << 8) | r4) as u16;
3336                bytes.extend_from_slice(&hw1.to_le_bytes());
3337                bytes.extend_from_slice(&hw2.to_le_bytes());
3338
3339                // LSL.W rd_lo, rn_lo, shift (rd_lo = lo << n)
3340                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3341                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | shift_bits) as u16;
3342                bytes.extend_from_slice(&hw1.to_le_bytes());
3343                bytes.extend_from_slice(&hw2.to_le_bytes());
3344
3345                // ORR.W rd_lo, rd_lo, R12 (rd_lo = (lo << n) | (hi >> (32-n)))
3346                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3347                let hw2: u16 = ((rd_lo_bits << 8) | r12) as u16;
3348                bytes.extend_from_slice(&hw1.to_le_bytes());
3349                bytes.extend_from_slice(&hw2.to_le_bytes());
3350
3351                // B .done (skip large block, offset = +14 halfwords)
3352                let b_done: u16 = 0xE00E;
3353                bytes.extend_from_slice(&b_done.to_le_bytes());
3354
3355                // === Large rotation (n >= 32) ===
3356                // R3 already has n-32 from the SUBS
3357                // RSB.W R4, R3, #32 (R4 = 32-(n-32) = 64-n)
3358                let hw1: u16 = (0xF1C0 | r3) as u16;
3359                let hw2: u16 = ((r4 << 8) | 0x20) as u16;
3360                bytes.extend_from_slice(&hw1.to_le_bytes());
3361                bytes.extend_from_slice(&hw2.to_le_bytes());
3362
3363                // LSR.W R12, rn_hi, R4 (R12 = hi >> (64-n), goes to new_hi low bits)
3364                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3365                let hw2: u16 = (0xF000 | (r12 << 8) | r4) as u16;
3366                bytes.extend_from_slice(&hw1.to_le_bytes());
3367                bytes.extend_from_slice(&hw2.to_le_bytes());
3368
3369                // LSR.W R4, rn_lo, R4 (R4 = lo >> (64-n), goes to new_lo low bits)
3370                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3371                let hw2: u16 = (0xF000 | (r4 << 8) | r4) as u16;
3372                bytes.extend_from_slice(&hw1.to_le_bytes());
3373                bytes.extend_from_slice(&hw2.to_le_bytes());
3374
3375                // LSL.W shift, rn_lo, R3 (shift = lo << (n-32), new_hi high bits)
3376                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3377                let hw2: u16 = (0xF000 | (shift_bits << 8) | r3) as u16;
3378                bytes.extend_from_slice(&hw1.to_le_bytes());
3379                bytes.extend_from_slice(&hw2.to_le_bytes());
3380
3381                // ORR.W shift, shift, R12 (shift = (lo << (n-32)) | (hi >> (64-n)) = new_hi)
3382                let hw1: u16 = (0xEA40 | shift_bits) as u16;
3383                let hw2: u16 = ((shift_bits << 8) | r12) as u16;
3384                bytes.extend_from_slice(&hw1.to_le_bytes());
3385                bytes.extend_from_slice(&hw2.to_le_bytes());
3386
3387                // LSL.W rd_lo, rn_hi, R3 (rd_lo = hi << (n-32), new_lo high bits)
3388                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3389                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | r3) as u16;
3390                bytes.extend_from_slice(&hw1.to_le_bytes());
3391                bytes.extend_from_slice(&hw2.to_le_bytes());
3392
3393                // ORR.W rd_lo, rd_lo, R4 (rd_lo = (hi << (n-32)) | (lo >> (64-n)) = new_lo)
3394                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3395                let hw2: u16 = ((rd_lo_bits << 8) | r4) as u16;
3396                bytes.extend_from_slice(&hw1.to_le_bytes());
3397                bytes.extend_from_slice(&hw2.to_le_bytes());
3398
3399                // MOV rd_hi, shift (rd_hi = new_hi)
3400                let d_bit = (rd_hi_bits >> 3) & 1;
3401                let mov_instr: u16 =
3402                    (0x4600 | (d_bit << 7) | (shift_bits << 3) | (rd_hi_bits & 0x7)) as u16;
3403                bytes.extend_from_slice(&mov_instr.to_le_bytes());
3404
3405                // POP {R4}
3406                bytes.extend_from_slice(&0xBC10u16.to_le_bytes());
3407
3408                Ok(bytes) // Total: 74 bytes
3409            }
3410
3411            // I64Rotr: 64-bit rotate right
3412            // rotr(x, n) = rotl(x, 64-n)
3413            // For n < 32: new_lo = (lo >> n) | (hi << (32-n)), new_hi = (hi >> n) | (lo << (32-n))
3414            // For n >= 32: same formula but with lo/hi swapped, shift by (n-32)
3415            ArmOp::I64Rotr {
3416                rdlo,
3417                rdhi,
3418                rnlo,
3419                rnhi,
3420                shift,
3421            } => {
3422                let rd_lo_bits = reg_to_bits(rdlo);
3423                let rd_hi_bits = reg_to_bits(rdhi);
3424                let rn_lo_bits = reg_to_bits(rnlo);
3425                let rn_hi_bits = reg_to_bits(rnhi);
3426                let shift_bits = reg_to_bits(shift);
3427                let r12: u32 = 12;
3428                let r3: u32 = 3;
3429                let r4: u32 = 4;
3430                let mut bytes = Vec::new();
3431
3432                // PUSH {R4}
3433                bytes.extend_from_slice(&0xB410u16.to_le_bytes());
3434
3435                // AND.W shift, shift, #63
3436                let hw1: u16 = (0xF000 | shift_bits) as u16;
3437                let hw2: u16 = ((shift_bits << 8) | 0x3F) as u16;
3438                bytes.extend_from_slice(&hw1.to_le_bytes());
3439                bytes.extend_from_slice(&hw2.to_le_bytes());
3440
3441                // SUBS.W R3, shift, #32
3442                let hw1: u16 = (0xF1B0 | shift_bits) as u16;
3443                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3444                bytes.extend_from_slice(&hw1.to_le_bytes());
3445                bytes.extend_from_slice(&hw2.to_le_bytes());
3446
3447                // BPL .large (+14 halfwords)
3448                let bpl: u16 = 0xD50E;
3449                bytes.extend_from_slice(&bpl.to_le_bytes());
3450
3451                // === Small rotation (n < 32) ===
3452                // RSB.W R3, shift, #32 (R3 = 32-n)
3453                let hw1: u16 = (0xF1C0 | shift_bits) as u16;
3454                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3455                bytes.extend_from_slice(&hw1.to_le_bytes());
3456                bytes.extend_from_slice(&hw2.to_le_bytes());
3457
3458                // LSL.W R4, rn_hi, R3 (R4 = hi << (32-n), will go to new_lo)
3459                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3460                let hw2: u16 = (0xF000 | (r4 << 8) | r3) as u16;
3461                bytes.extend_from_slice(&hw1.to_le_bytes());
3462                bytes.extend_from_slice(&hw2.to_le_bytes());
3463
3464                // LSL.W R12, rn_lo, R3 (R12 = lo << (32-n), will go to new_hi)
3465                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3466                let hw2: u16 = (0xF000 | (r12 << 8) | r3) as u16;
3467                bytes.extend_from_slice(&hw1.to_le_bytes());
3468                bytes.extend_from_slice(&hw2.to_le_bytes());
3469
3470                // LSR.W rd_lo, rn_lo, shift (rd_lo = lo >> n)
3471                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3472                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | shift_bits) as u16;
3473                bytes.extend_from_slice(&hw1.to_le_bytes());
3474                bytes.extend_from_slice(&hw2.to_le_bytes());
3475
3476                // ORR.W rd_lo, rd_lo, R4 (rd_lo = (lo >> n) | (hi << (32-n)))
3477                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3478                let hw2: u16 = ((rd_lo_bits << 8) | r4) as u16;
3479                bytes.extend_from_slice(&hw1.to_le_bytes());
3480                bytes.extend_from_slice(&hw2.to_le_bytes());
3481
3482                // LSR.W rd_hi, rn_hi, shift (rd_hi = hi >> n)
3483                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3484                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | shift_bits) as u16;
3485                bytes.extend_from_slice(&hw1.to_le_bytes());
3486                bytes.extend_from_slice(&hw2.to_le_bytes());
3487
3488                // ORR.W rd_hi, rd_hi, R12 (rd_hi = (hi >> n) | (lo << (32-n)))
3489                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3490                let hw2: u16 = ((rd_hi_bits << 8) | r12) as u16;
3491                bytes.extend_from_slice(&hw1.to_le_bytes());
3492                bytes.extend_from_slice(&hw2.to_le_bytes());
3493
3494                // B .done (+14 halfwords)
3495                let b_done: u16 = 0xE00E;
3496                bytes.extend_from_slice(&b_done.to_le_bytes());
3497
3498                // === Large rotation (n >= 32) ===
3499                // RSB.W R4, R3, #32 (R4 = 64-n)
3500                let hw1: u16 = (0xF1C0 | r3) as u16;
3501                let hw2: u16 = ((r4 << 8) | 0x20) as u16;
3502                bytes.extend_from_slice(&hw1.to_le_bytes());
3503                bytes.extend_from_slice(&hw2.to_le_bytes());
3504
3505                // LSL.W R12, rn_lo, R4 (R12 = lo << (64-n), goes to new_lo low bits)
3506                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3507                let hw2: u16 = (0xF000 | (r12 << 8) | r4) as u16;
3508                bytes.extend_from_slice(&hw1.to_le_bytes());
3509                bytes.extend_from_slice(&hw2.to_le_bytes());
3510
3511                // LSL.W R4, rn_hi, R4 (R4 = hi << (64-n), goes to new_hi low bits)
3512                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3513                let hw2: u16 = (0xF000 | (r4 << 8) | r4) as u16;
3514                bytes.extend_from_slice(&hw1.to_le_bytes());
3515                bytes.extend_from_slice(&hw2.to_le_bytes());
3516
3517                // LSR.W shift, rn_hi, R3 (shift = hi >> (n-32), new_lo high bits)
3518                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3519                let hw2: u16 = (0xF000 | (shift_bits << 8) | r3) as u16;
3520                bytes.extend_from_slice(&hw1.to_le_bytes());
3521                bytes.extend_from_slice(&hw2.to_le_bytes());
3522
3523                // ORR.W shift, shift, R12 (shift = (hi >> (n-32)) | (lo << (64-n)) = new_lo)
3524                let hw1: u16 = (0xEA40 | shift_bits) as u16;
3525                let hw2: u16 = ((shift_bits << 8) | r12) as u16;
3526                bytes.extend_from_slice(&hw1.to_le_bytes());
3527                bytes.extend_from_slice(&hw2.to_le_bytes());
3528
3529                // LSR.W rd_hi, rn_lo, R3 (rd_hi = lo >> (n-32), new_hi high bits)
3530                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3531                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | r3) as u16;
3532                bytes.extend_from_slice(&hw1.to_le_bytes());
3533                bytes.extend_from_slice(&hw2.to_le_bytes());
3534
3535                // ORR.W rd_hi, rd_hi, R4 (rd_hi = (lo >> (n-32)) | (hi << (64-n)) = new_hi)
3536                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3537                let hw2: u16 = ((rd_hi_bits << 8) | r4) as u16;
3538                bytes.extend_from_slice(&hw1.to_le_bytes());
3539                bytes.extend_from_slice(&hw2.to_le_bytes());
3540
3541                // MOV rd_lo, shift (rd_lo = new_lo)
3542                let d_bit = (rd_lo_bits >> 3) & 1;
3543                let mov_instr: u16 =
3544                    (0x4600 | (d_bit << 7) | (shift_bits << 3) | (rd_lo_bits & 0x7)) as u16;
3545                bytes.extend_from_slice(&mov_instr.to_le_bytes());
3546
3547                // POP {R4}
3548                bytes.extend_from_slice(&0xBC10u16.to_le_bytes());
3549
3550                Ok(bytes) // Total: 74 bytes
3551            }
3552
3553            // I64Clz: Count leading zeros in 64-bit value
3554            // If hi != 0: result = CLZ(hi)
3555            // If hi == 0: result = 32 + CLZ(lo)
3556            //
3557            // Layout (using CMP+BNE approach for consistency):
3558            // 0: CMP.W rnhi, #0 (4 bytes)
3559            // 4: BEQ .hi_zero (2 bytes) - branch forward to offset 14
3560            // 6: CLZ.W rd, rnhi (4 bytes)
3561            // 10: B .done (2 bytes) - branch forward to offset 22
3562            // 12: NOP (2 bytes) - padding for alignment
3563            // 14: .hi_zero: CLZ.W rd, rnlo (4 bytes)
3564            // 18: ADD.W rd, rd, #32 (4 bytes)
3565            // 22: .done
3566            ArmOp::I64Clz { rd, rnlo, rnhi } => {
3567                let rd_bits = reg_to_bits(rd);
3568                let rn_lo_bits = reg_to_bits(rnlo);
3569                let rn_hi_bits = reg_to_bits(rnhi);
3570                let mut bytes = Vec::new();
3571
3572                // CMP.W rnhi, #0 (4 bytes at offset 0)
3573                let hw1: u16 = (0xF1B0 | rn_hi_bits) as u16;
3574                let hw2: u16 = 0x0F00;
3575                bytes.extend_from_slice(&hw1.to_le_bytes());
3576                bytes.extend_from_slice(&hw2.to_le_bytes());
3577
3578                // BEQ .hi_zero (2 bytes at offset 4)
3579                // PC = 4 + 4 = 8, target = 14, offset = 6, imm8 = 3
3580                let beq: u16 = 0xD003;
3581                bytes.extend_from_slice(&beq.to_le_bytes());
3582
3583                // CLZ.W rd, rnhi (4 bytes at offset 6)
3584                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3585                let hw1: u16 = (0xFAB0 | rn_hi_bits) as u16;
3586                let hw2: u16 = (0xF080 | (rd_bits << 8) | rn_hi_bits) as u16;
3587                bytes.extend_from_slice(&hw1.to_le_bytes());
3588                bytes.extend_from_slice(&hw2.to_le_bytes());
3589
3590                // B .done (2 bytes at offset 10)
3591                // PC = 10 + 4 = 14, target = 22, offset = 8, imm11 = 4
3592                let b_done: u16 = 0xE004;
3593                bytes.extend_from_slice(&b_done.to_le_bytes());
3594
3595                // NOP (2 bytes at offset 12) - padding
3596                bytes.extend_from_slice(&0xBF00u16.to_le_bytes());
3597
3598                // .hi_zero: (offset 14)
3599                // CLZ.W rd, rnlo (4 bytes)
3600                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3601                let hw1: u16 = (0xFAB0 | rn_lo_bits) as u16;
3602                let hw2: u16 = (0xF080 | (rd_bits << 8) | rn_lo_bits) as u16;
3603                bytes.extend_from_slice(&hw1.to_le_bytes());
3604                bytes.extend_from_slice(&hw2.to_le_bytes());
3605
3606                // ADD.W rd, rd, #32 (4 bytes at offset 18)
3607                let hw1: u16 = (0xF100 | rd_bits) as u16;
3608                let hw2: u16 = ((rd_bits << 8) | 0x20) as u16;
3609                bytes.extend_from_slice(&hw1.to_le_bytes());
3610                bytes.extend_from_slice(&hw2.to_le_bytes());
3611
3612                // .done: (offset 22)
3613                // i64.clz returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
3614                // MOVS Rn, #0: 0010 0 Rn 00000000
3615                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3616                bytes.extend_from_slice(&mov0.to_le_bytes());
3617
3618                Ok(bytes)
3619            }
3620
3621            // I64Ctz: Count trailing zeros in 64-bit value
3622            // If lo != 0: result = CTZ(lo) = CLZ(RBIT(lo))
3623            // If lo == 0: result = 32 + CTZ(hi) = 32 + CLZ(RBIT(hi))
3624            //
3625            // Layout:
3626            // 0: CMP.W rnlo, #0 (4 bytes)
3627            // 4: BEQ .lo_zero (2 bytes) - branch to offset 18
3628            // 6: RBIT.W rd, rnlo (4 bytes)
3629            // 10: CLZ.W rd, rd (4 bytes)
3630            // 14: B .done (2 bytes) - branch to offset 30
3631            // 16: NOP (2 bytes) - padding
3632            // 18: .lo_zero: RBIT.W rd, rnhi (4 bytes)
3633            // 22: CLZ.W rd, rd (4 bytes)
3634            // 26: ADD.W rd, rd, #32 (4 bytes)
3635            // 30: .done
3636            ArmOp::I64Ctz { rd, rnlo, rnhi } => {
3637                let rd_bits = reg_to_bits(rd);
3638                let rn_lo_bits = reg_to_bits(rnlo);
3639                let rn_hi_bits = reg_to_bits(rnhi);
3640                let mut bytes = Vec::new();
3641
3642                // CMP.W rnlo, #0 (4 bytes at offset 0)
3643                let hw1: u16 = (0xF1B0 | rn_lo_bits) as u16;
3644                let hw2: u16 = 0x0F00;
3645                bytes.extend_from_slice(&hw1.to_le_bytes());
3646                bytes.extend_from_slice(&hw2.to_le_bytes());
3647
3648                // BEQ .lo_zero (2 bytes at offset 4)
3649                // PC = 4 + 4 = 8, target = 18, offset = 10, imm8 = 5
3650                let beq: u16 = 0xD005;
3651                bytes.extend_from_slice(&beq.to_le_bytes());
3652
3653                // RBIT.W rd, rnlo (4 bytes at offset 6)
3654                // RBIT T1: hw1 = 0xFA9<Rm>, hw2 = 0xF<Rd>A<Rm>
3655                let hw1: u16 = (0xFA90 | rn_lo_bits) as u16;
3656                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rn_lo_bits) as u16;
3657                bytes.extend_from_slice(&hw1.to_le_bytes());
3658                bytes.extend_from_slice(&hw2.to_le_bytes());
3659
3660                // CLZ.W rd, rd (4 bytes at offset 10)
3661                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3662                let hw1: u16 = (0xFAB0 | rd_bits) as u16;
3663                let hw2: u16 = (0xF080 | (rd_bits << 8) | rd_bits) as u16;
3664                bytes.extend_from_slice(&hw1.to_le_bytes());
3665                bytes.extend_from_slice(&hw2.to_le_bytes());
3666
3667                // B .done (2 bytes at offset 14)
3668                // PC = 14 + 4 = 18, target = 30, offset = 12, imm11 = 6
3669                let b_done: u16 = 0xE006;
3670                bytes.extend_from_slice(&b_done.to_le_bytes());
3671
3672                // NOP (2 bytes at offset 16) - padding
3673                bytes.extend_from_slice(&0xBF00u16.to_le_bytes());
3674
3675                // .lo_zero: (offset 18)
3676                // RBIT.W rd, rnhi (4 bytes)
3677                // RBIT T1: hw1 = 0xFA9<Rm>, hw2 = 0xF<Rd>A<Rm>
3678                let hw1: u16 = (0xFA90 | rn_hi_bits) as u16;
3679                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rn_hi_bits) as u16;
3680                bytes.extend_from_slice(&hw1.to_le_bytes());
3681                bytes.extend_from_slice(&hw2.to_le_bytes());
3682
3683                // CLZ.W rd, rd (4 bytes at offset 22)
3684                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3685                let hw1: u16 = (0xFAB0 | rd_bits) as u16;
3686                let hw2: u16 = (0xF080 | (rd_bits << 8) | rd_bits) as u16;
3687                bytes.extend_from_slice(&hw1.to_le_bytes());
3688                bytes.extend_from_slice(&hw2.to_le_bytes());
3689
3690                // ADD.W rd, rd, #32 (4 bytes at offset 26)
3691                let hw1: u16 = (0xF100 | rd_bits) as u16;
3692                let hw2: u16 = ((rd_bits << 8) | 0x20) as u16;
3693                bytes.extend_from_slice(&hw1.to_le_bytes());
3694                bytes.extend_from_slice(&hw2.to_le_bytes());
3695
3696                // .done: (offset 30)
3697                // i64.ctz returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
3698                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3699                bytes.extend_from_slice(&mov0.to_le_bytes());
3700
3701                Ok(bytes)
3702            }
3703
3704            // I64Popcnt: Population count of 64-bit value
3705            // result = POPCNT(lo) + POPCNT(hi)
3706            // Using SIMD-style parallel bit counting algorithm
3707            ArmOp::I64Popcnt { rd, rnlo, rnhi } => {
3708                let rd_bits = reg_to_bits(rd);
3709                let rn_lo_bits = reg_to_bits(rnlo);
3710                let rn_hi_bits = reg_to_bits(rnhi);
3711                let r12: u32 = 12; // IP scratch
3712                let r3: u32 = 3; // Scratch for hi popcnt result
3713                let mut bytes = Vec::new();
3714
3715                // PUSH {R3, R4, R5} - save scratch registers
3716                bytes.extend_from_slice(&0xB438u16.to_le_bytes());
3717
3718                // Strategy: compute popcnt(lo) -> R4, popcnt(hi) -> R5, add them -> rd
3719                // Using lookup table approach for each byte would be too large
3720                // Using shift-and-add approach instead
3721
3722                // For simplicity and correctness, use the efficient parallel algorithm
3723                // but implement it as a series of inline operations
3724
3725                // MOV R4, rnlo
3726                let d_bit: u32 = 0; // R4 < 8, so high bit is 0
3727                let mov: u16 = (0x4600 | (d_bit << 7) | (rn_lo_bits << 3) | (4 & 0x7)) as u16;
3728                bytes.extend_from_slice(&mov.to_le_bytes());
3729
3730                // MOV R5, rnhi
3731                let d_bit: u32 = 0; // R5 < 8, so high bit is 0
3732                let mov: u16 = (0x4600 | (d_bit << 7) | (rn_hi_bits << 3) | (5 & 0x7)) as u16;
3733                bytes.extend_from_slice(&mov.to_le_bytes());
3734
3735                // --- POPCNT for R4 (lo word) ---
3736                // Step 1: x = x - ((x >> 1) & 0x55555555)
3737                // LSR.W R12, R4, #1
3738                let hw1: u16 = 0xEA4F;
3739                let hw2: u16 = ((r12 << 8) | 0x50 | 4) as u16;
3740                bytes.extend_from_slice(&hw1.to_le_bytes());
3741                bytes.extend_from_slice(&hw2.to_le_bytes());
3742
3743                // Load 0x55555555 into R3 using MOVW/MOVT
3744                // MOVW R3, #0x5555
3745                bytes.extend_from_slice(&0xF245u16.to_le_bytes());
3746                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3747                // MOVT R3, #0x5555
3748                bytes.extend_from_slice(&0xF2C5u16.to_le_bytes());
3749                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3750
3751                // AND.W R12, R12, R3
3752                let hw1: u16 = (0xEA00 | r12) as u16;
3753                let hw2: u16 = ((r12 << 8) | r3) as u16;
3754                bytes.extend_from_slice(&hw1.to_le_bytes());
3755                bytes.extend_from_slice(&hw2.to_le_bytes());
3756
3757                // SUB.W R4, R4, R12
3758                let hw1: u16 = (0xEBA0 | 4) as u16;
3759                let hw2: u16 = ((4 << 8) | r12) as u16;
3760                bytes.extend_from_slice(&hw1.to_le_bytes());
3761                bytes.extend_from_slice(&hw2.to_le_bytes());
3762
3763                // Step 2: x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
3764                // Load 0x33333333 into R3
3765                // MOVW R3, #0x3333
3766                bytes.extend_from_slice(&0xF243u16.to_le_bytes());
3767                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3768                // MOVT R3, #0x3333
3769                bytes.extend_from_slice(&0xF2C3u16.to_le_bytes());
3770                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3771
3772                // AND.W R12, R4, R3
3773                let hw1: u16 = (0xEA00 | 4) as u16;
3774                let hw2: u16 = ((r12 << 8) | r3) as u16;
3775                bytes.extend_from_slice(&hw1.to_le_bytes());
3776                bytes.extend_from_slice(&hw2.to_le_bytes());
3777
3778                // LSR.W R4, R4, #2
3779                let hw1: u16 = 0xEA4F;
3780                let hw2: u16 = ((4 << 8) | 0x90 | 4) as u16;
3781                bytes.extend_from_slice(&hw1.to_le_bytes());
3782                bytes.extend_from_slice(&hw2.to_le_bytes());
3783
3784                // AND.W R4, R4, R3
3785                let hw1: u16 = (0xEA00 | 4) as u16;
3786                let hw2: u16 = ((4 << 8) | r3) as u16;
3787                bytes.extend_from_slice(&hw1.to_le_bytes());
3788                bytes.extend_from_slice(&hw2.to_le_bytes());
3789
3790                // ADD.W R4, R4, R12
3791                let hw1: u16 = (0xEB00 | 4) as u16;
3792                let hw2: u16 = ((4 << 8) | r12) as u16;
3793                bytes.extend_from_slice(&hw1.to_le_bytes());
3794                bytes.extend_from_slice(&hw2.to_le_bytes());
3795
3796                // Step 3: x = (x + (x >> 4)) & 0x0F0F0F0F
3797                // LSR.W R12, R4, #4
3798                // hw2 = (imm3 << 12) | (Rd << 8) | (imm2 << 6) | (type << 4) | Rm
3799                // imm5=4=00100 → imm3=1, imm2=0, type=01(LSR)
3800                let hw1: u16 = 0xEA4F;
3801                let hw2: u16 = (0x1000 | (r12 << 8) | 0x10 | 4) as u16;
3802                bytes.extend_from_slice(&hw1.to_le_bytes());
3803                bytes.extend_from_slice(&hw2.to_le_bytes());
3804
3805                // ADD.W R4, R4, R12
3806                let hw1: u16 = (0xEB00 | 4) as u16;
3807                let hw2: u16 = ((4 << 8) | r12) as u16;
3808                bytes.extend_from_slice(&hw1.to_le_bytes());
3809                bytes.extend_from_slice(&hw2.to_le_bytes());
3810
3811                // Load 0x0F0F0F0F into R3
3812                // MOVW R3, #0x0F0F (imm4=0, i=1, imm3=7, imm8=0x0F)
3813                // hw1 = 11110 1 10 0100 0000 = 0xF640
3814                // hw2 = 0 111 0011 00001111 = 0x730F
3815                bytes.extend_from_slice(&0xF640u16.to_le_bytes());
3816                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3817                // MOVT R3, #0x0F0F
3818                bytes.extend_from_slice(&0xF6C0u16.to_le_bytes());
3819                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3820
3821                // AND.W R4, R4, R3
3822                let hw1: u16 = (0xEA00 | 4) as u16;
3823                let hw2: u16 = ((4 << 8) | r3) as u16;
3824                bytes.extend_from_slice(&hw1.to_le_bytes());
3825                bytes.extend_from_slice(&hw2.to_le_bytes());
3826
3827                // Step 4: x = x * 0x01010101 >> 24
3828                // Load 0x01010101 into R3
3829                // MOVW R3, #0x0101
3830                bytes.extend_from_slice(&0xF240u16.to_le_bytes());
3831                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3832                // MOVT R3, #0x0101
3833                bytes.extend_from_slice(&0xF2C0u16.to_le_bytes());
3834                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3835
3836                // MUL R4, R4, R3
3837                // MUL T2: hw1 = 0xFB00|Rn, hw2 = 0xF000|(Rd<<8)|Rm
3838                let hw1: u16 = (0xFB00 | 4) as u16;
3839                let hw2: u16 = (0xF000 | (4 << 8) | r3) as u16;
3840                bytes.extend_from_slice(&hw1.to_le_bytes());
3841                bytes.extend_from_slice(&hw2.to_le_bytes());
3842
3843                // LSR.W R4, R4, #24
3844                // imm5=24=11000 → imm3=6, imm2=0, type=01(LSR)
3845                let hw1: u16 = 0xEA4F;
3846                let hw2: u16 = (0x6000 | (4 << 8) | 0x10 | 4) as u16;
3847                bytes.extend_from_slice(&hw1.to_le_bytes());
3848                bytes.extend_from_slice(&hw2.to_le_bytes());
3849
3850                // --- POPCNT for R5 (hi word) - same algorithm ---
3851                // Step 1
3852                let hw1: u16 = 0xEA4F;
3853                let hw2: u16 = ((r12 << 8) | 0x50 | 5) as u16;
3854                bytes.extend_from_slice(&hw1.to_le_bytes());
3855                bytes.extend_from_slice(&hw2.to_le_bytes());
3856
3857                // Load 0x55555555 into R3
3858                bytes.extend_from_slice(&0xF245u16.to_le_bytes());
3859                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3860                bytes.extend_from_slice(&0xF2C5u16.to_le_bytes());
3861                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3862
3863                let hw1: u16 = (0xEA00 | r12) as u16;
3864                let hw2: u16 = ((r12 << 8) | r3) as u16;
3865                bytes.extend_from_slice(&hw1.to_le_bytes());
3866                bytes.extend_from_slice(&hw2.to_le_bytes());
3867
3868                let hw1: u16 = (0xEBA0 | 5) as u16;
3869                let hw2: u16 = ((5 << 8) | r12) as u16;
3870                bytes.extend_from_slice(&hw1.to_le_bytes());
3871                bytes.extend_from_slice(&hw2.to_le_bytes());
3872
3873                // Step 2
3874                bytes.extend_from_slice(&0xF243u16.to_le_bytes());
3875                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3876                bytes.extend_from_slice(&0xF2C3u16.to_le_bytes());
3877                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3878
3879                let hw1: u16 = (0xEA00 | 5) as u16;
3880                let hw2: u16 = ((r12 << 8) | r3) as u16;
3881                bytes.extend_from_slice(&hw1.to_le_bytes());
3882                bytes.extend_from_slice(&hw2.to_le_bytes());
3883
3884                let hw1: u16 = 0xEA4F;
3885                let hw2: u16 = ((5 << 8) | 0x90 | 5) as u16;
3886                bytes.extend_from_slice(&hw1.to_le_bytes());
3887                bytes.extend_from_slice(&hw2.to_le_bytes());
3888
3889                let hw1: u16 = (0xEA00 | 5) as u16;
3890                let hw2: u16 = ((5 << 8) | r3) as u16;
3891                bytes.extend_from_slice(&hw1.to_le_bytes());
3892                bytes.extend_from_slice(&hw2.to_le_bytes());
3893
3894                let hw1: u16 = (0xEB00 | 5) as u16;
3895                let hw2: u16 = ((5 << 8) | r12) as u16;
3896                bytes.extend_from_slice(&hw1.to_le_bytes());
3897                bytes.extend_from_slice(&hw2.to_le_bytes());
3898
3899                // Step 3: LSR.W R12, R5, #4
3900                // imm5=4=00100 → imm3=1, imm2=0, type=01(LSR)
3901                let hw1: u16 = 0xEA4F;
3902                let hw2: u16 = (0x1000 | (r12 << 8) | 0x10 | 5) as u16;
3903                bytes.extend_from_slice(&hw1.to_le_bytes());
3904                bytes.extend_from_slice(&hw2.to_le_bytes());
3905
3906                let hw1: u16 = (0xEB00 | 5) as u16;
3907                let hw2: u16 = ((5 << 8) | r12) as u16;
3908                bytes.extend_from_slice(&hw1.to_le_bytes());
3909                bytes.extend_from_slice(&hw2.to_le_bytes());
3910
3911                // Load 0x0F0F0F0F into R3 (for hi-word)
3912                bytes.extend_from_slice(&0xF640u16.to_le_bytes());
3913                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3914                bytes.extend_from_slice(&0xF6C0u16.to_le_bytes());
3915                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3916
3917                let hw1: u16 = (0xEA00 | 5) as u16;
3918                let hw2: u16 = ((5 << 8) | r3) as u16;
3919                bytes.extend_from_slice(&hw1.to_le_bytes());
3920                bytes.extend_from_slice(&hw2.to_le_bytes());
3921
3922                // Step 4
3923                bytes.extend_from_slice(&0xF240u16.to_le_bytes());
3924                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3925                bytes.extend_from_slice(&0xF2C0u16.to_le_bytes());
3926                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3927
3928                // MUL R5, R5, R3
3929                // MUL T2: hw1 = 0xFB00|Rn, hw2 = 0xF000|(Rd<<8)|Rm
3930                let hw1: u16 = (0xFB00 | 5) as u16;
3931                let hw2: u16 = (0xF000 | (5 << 8) | r3) as u16;
3932                bytes.extend_from_slice(&hw1.to_le_bytes());
3933                bytes.extend_from_slice(&hw2.to_le_bytes());
3934
3935                // LSR.W R5, R5, #24
3936                // imm5=24=11000 → imm3=6, imm2=0, type=01(LSR)
3937                let hw1: u16 = 0xEA4F;
3938                let hw2: u16 = (0x6000 | (5 << 8) | 0x10 | 5) as u16;
3939                bytes.extend_from_slice(&hw1.to_le_bytes());
3940                bytes.extend_from_slice(&hw2.to_le_bytes());
3941
3942                // ADD rd, R4, R5 (combine lo and hi counts)
3943                // ADDS Rd, Rn, Rm (T1): 0001 100 Rm Rn Rd = 0x1800 | (Rm<<6) | (Rn<<3) | Rd
3944                let rd_bits_u16 = rd_bits as u16;
3945                let instr: u16 = 0x1800 | (5 << 6) | (4 << 3) | rd_bits_u16;
3946                bytes.extend_from_slice(&instr.to_le_bytes());
3947
3948                // POP {R3, R4, R5}
3949                bytes.extend_from_slice(&0xBC38u16.to_le_bytes());
3950
3951                // i64.popcnt returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
3952                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3953                bytes.extend_from_slice(&mov0.to_le_bytes());
3954
3955                Ok(bytes)
3956            }
3957
3958            // I64Extend8S: Sign-extend low 8 bits to 64 bits
3959            // Result: rdlo = sign_extend_8(rnlo), rdhi = rdlo >> 31
3960            ArmOp::I64Extend8S { rdlo, rdhi, rnlo } => {
3961                let rdlo_bits = reg_to_bits(rdlo);
3962                let rdhi_bits = reg_to_bits(rdhi);
3963                let rnlo_bits = reg_to_bits(rnlo);
3964                let mut bytes = Vec::new();
3965
3966                // SXTB.W rdlo, rnlo (sign-extend byte to 32-bit)
3967                // SXTB T2: hw1 = 0xFA4F, hw2 = 0xF0<Rd><Rm>
3968                let hw1: u16 = 0xFA4F_u16;
3969                let hw2: u16 = (0xF080 | (rdlo_bits << 8) | rnlo_bits) as u16;
3970                bytes.extend_from_slice(&hw1.to_le_bytes());
3971                bytes.extend_from_slice(&hw2.to_le_bytes());
3972
3973                // ASR.W rdhi, rdlo, #31 (sign-extend to high word)
3974                // ASR (immediate): hw1 = 0xEA4F, hw2 = imm3:Rd:imm2:type:Rm
3975                // For imm5=31: imm3=111, imm2=11, type=10 (ASR)
3976                // hw2 = (7 << 12) | (rdhi << 8) | (3 << 6) | (2 << 4) | rdlo
3977                let hw1: u16 = 0xEA4F;
3978                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rdlo_bits) as u16;
3979                bytes.extend_from_slice(&hw1.to_le_bytes());
3980                bytes.extend_from_slice(&hw2.to_le_bytes());
3981
3982                Ok(bytes)
3983            }
3984
3985            // I64Extend16S: Sign-extend low 16 bits to 64 bits
3986            // Result: rdlo = sign_extend_16(rnlo), rdhi = rdlo >> 31
3987            ArmOp::I64Extend16S { rdlo, rdhi, rnlo } => {
3988                let rdlo_bits = reg_to_bits(rdlo);
3989                let rdhi_bits = reg_to_bits(rdhi);
3990                let rnlo_bits = reg_to_bits(rnlo);
3991                let mut bytes = Vec::new();
3992
3993                // SXTH.W rdlo, rnlo (sign-extend halfword to 32-bit)
3994                // SXTH T2: hw1 = 0xFA0F, hw2 = 0xF0<Rd><Rm>
3995                let hw1: u16 = 0xFA0F_u16;
3996                let hw2: u16 = (0xF080 | (rdlo_bits << 8) | rnlo_bits) as u16;
3997                bytes.extend_from_slice(&hw1.to_le_bytes());
3998                bytes.extend_from_slice(&hw2.to_le_bytes());
3999
4000                // ASR.W rdhi, rdlo, #31 (sign-extend to high word)
4001                let hw1: u16 = 0xEA4F;
4002                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rdlo_bits) as u16;
4003                bytes.extend_from_slice(&hw1.to_le_bytes());
4004                bytes.extend_from_slice(&hw2.to_le_bytes());
4005
4006                Ok(bytes)
4007            }
4008
4009            // I64Extend32S: Sign-extend low 32 bits to 64 bits
4010            // Result: rdlo = rnlo, rdhi = rnlo >> 31
4011            ArmOp::I64Extend32S { rdlo, rdhi, rnlo } => {
4012                let rdlo_bits = reg_to_bits(rdlo);
4013                let rdhi_bits = reg_to_bits(rdhi);
4014                let rnlo_bits = reg_to_bits(rnlo);
4015                let mut bytes = Vec::new();
4016
4017                // MOV rdlo, rnlo (if different)
4018                if rdlo_bits != rnlo_bits {
4019                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
4020                    let d_bit = ((rdlo_bits >> 3) & 1) as u16;
4021                    let mov: u16 = 0x4600
4022                        | (d_bit << 7)
4023                        | ((rnlo_bits as u16) << 3)
4024                        | ((rdlo_bits & 0x7) as u16);
4025                    bytes.extend_from_slice(&mov.to_le_bytes());
4026                }
4027
4028                // ASR.W rdhi, rnlo, #31 (sign-extend to high word)
4029                let hw1: u16 = 0xEA4F;
4030                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rnlo_bits) as u16;
4031                bytes.extend_from_slice(&hw1.to_le_bytes());
4032                bytes.extend_from_slice(&hw2.to_le_bytes());
4033
4034                Ok(bytes)
4035            }
4036
4037            // SelectMove: IT <cond>; MOV{cond} rd, rm
4038            // Conditional move: only execute MOV if condition is true
4039            ArmOp::SelectMove { rd, rm, cond } => {
4040                let rd_bits = reg_to_bits(rd) as u16;
4041                let rm_bits = reg_to_bits(rm) as u16;
4042
4043                // Condition code encoding for IT block
4044                use synth_synthesis::Condition;
4045                let cond_bits: u16 = match cond {
4046                    Condition::EQ => 0x0, // Equal
4047                    Condition::NE => 0x1, // Not equal
4048                    Condition::HS => 0x2, // Higher or same (unsigned >=)
4049                    Condition::LO => 0x3, // Lower (unsigned <)
4050                    Condition::HI => 0x8, // Higher (unsigned >)
4051                    Condition::LS => 0x9, // Lower or same (unsigned <=)
4052                    Condition::GE => 0xA, // Greater or equal (signed)
4053                    Condition::LT => 0xB, // Less than (signed)
4054                    Condition::GT => 0xC, // Greater than (signed)
4055                    Condition::LE => 0xD, // Less or equal (signed)
4056                };
4057
4058                // IT <cond>: single Then block (mask = 0x8 for T only)
4059                // IT instruction: 1011 1111 firstcond mask
4060                let it_instr: u16 = 0xBF00 | (cond_bits << 4) | 0x8;
4061
4062                // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
4063                // This MOV will only execute if condition is true due to IT block
4064                let d_bit = (rd_bits >> 3) & 1;
4065                let mov_instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
4066
4067                // Emit: IT <cond>, MOV rd, rm
4068                let mut bytes = it_instr.to_le_bytes().to_vec();
4069                bytes.extend_from_slice(&mov_instr.to_le_bytes());
4070                Ok(bytes)
4071            }
4072
4073            // Popcnt: Population count (count set bits)
4074            // ARM Cortex-M has no native POPCNT, so we implement the bit manipulation algorithm:
4075            // x = x - ((x >> 1) & 0x55555555);
4076            // x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
4077            // x = (x + (x >> 4)) & 0x0F0F0F0F;
4078            // x = x + (x >> 8);
4079            // x = x + (x >> 16);
4080            // return x & 0x3F;
4081            //
4082            // Uses rd as working register and R12 as scratch for constants
4083            ArmOp::Popcnt { rd, rm } => {
4084                let mut bytes = Vec::new();
4085
4086                // First, move rm to rd if they're different
4087                if rd != rm {
4088                    let rd_bits = reg_to_bits(rd) as u16;
4089                    let rm_bits = reg_to_bits(rm) as u16;
4090                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
4091                    let d_bit = (rd_bits >> 3) & 1;
4092                    let mov_instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
4093                    bytes.extend_from_slice(&mov_instr.to_le_bytes());
4094                }
4095
4096                // Step 1: x = x - ((x >> 1) & 0x55555555)
4097                // Load 0x55555555 into R12
4098                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x5555)?);
4099                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x5555)?);
4100
4101                // R12_temp = rd >> 1
4102                // We need a second scratch register. Use R11.
4103                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 1)?);
4104
4105                // R11 = R11 & R12 (R11 = (x >> 1) & 0x55555555)
4106                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(11, 11, 12)?);
4107
4108                // rd = rd - R11
4109                bytes.extend_from_slice(&self.encode_thumb32_sub_reg_raw(
4110                    reg_to_bits(rd),
4111                    reg_to_bits(rd),
4112                    11,
4113                )?);
4114
4115                // Step 2: x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
4116                // Load 0x33333333 into R12
4117                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x3333)?);
4118                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x3333)?);
4119
4120                // R11 = rd & R12
4121                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4122                    11,
4123                    reg_to_bits(rd),
4124                    12,
4125                )?);
4126
4127                // rd = rd >> 2
4128                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(
4129                    reg_to_bits(rd),
4130                    reg_to_bits(rd),
4131                    2,
4132                )?);
4133
4134                // rd = rd & R12
4135                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4136                    reg_to_bits(rd),
4137                    reg_to_bits(rd),
4138                    12,
4139                )?);
4140
4141                // rd = rd + R11
4142                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4143                    reg_to_bits(rd),
4144                    reg_to_bits(rd),
4145                    11,
4146                )?);
4147
4148                // Step 3: x = (x + (x >> 4)) & 0x0F0F0F0F
4149                // R11 = rd >> 4
4150                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 4)?);
4151
4152                // rd = rd + R11
4153                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4154                    reg_to_bits(rd),
4155                    reg_to_bits(rd),
4156                    11,
4157                )?);
4158
4159                // Load 0x0F0F0F0F into R12
4160                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x0F0F)?);
4161                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x0F0F)?);
4162
4163                // rd = rd & R12
4164                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4165                    reg_to_bits(rd),
4166                    reg_to_bits(rd),
4167                    12,
4168                )?);
4169
4170                // Step 4: x = x + (x >> 8)
4171                // R11 = rd >> 8
4172                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 8)?);
4173
4174                // rd = rd + R11
4175                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4176                    reg_to_bits(rd),
4177                    reg_to_bits(rd),
4178                    11,
4179                )?);
4180
4181                // Step 5: x = x + (x >> 16)
4182                // R11 = rd >> 16
4183                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 16)?);
4184
4185                // rd = rd + R11
4186                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4187                    reg_to_bits(rd),
4188                    reg_to_bits(rd),
4189                    11,
4190                )?);
4191
4192                // Step 6: return x & 0x3F
4193                // AND with 0x3F (small immediate, can use BIC or AND with immediate)
4194                bytes.extend_from_slice(&self.encode_thumb32_and_imm_raw(
4195                    reg_to_bits(rd),
4196                    reg_to_bits(rd),
4197                    0x3F,
4198                )?);
4199
4200                Ok(bytes)
4201            }
4202
4203            // I64DivU: 64-bit unsigned division using binary long division
4204            // Input: R0:R1 = dividend, R2:R3 = divisor
4205            // Output: R0:R1 = quotient
4206            // Uses: R4-R7, R12 as loop counter (avoid R8 for Renode compatibility)
4207            ArmOp::I64DivU {
4208                rdlo: _,
4209                rdhi: _,
4210                rnlo: _,
4211                rnhi: _,
4212                rmlo: _,
4213                rmhi: _,
4214            } => {
4215                let mut bytes = Vec::new();
4216
4217                // PUSH {R4-R7} - save scratch registers (NO LR — this is inline code)
4218                // 16-bit PUSH: 1011 010 M rrrrrrrr where M=0 (no LR), r=R4-R7 = 0xF0
4219                // Encoding: 1011 0100 1111 0000 = 0xB4F0
4220                bytes.extend_from_slice(&0xB4F0u16.to_le_bytes());
4221
4222                // Initialize quotient (R4:R5) = 0
4223                bytes.extend_from_slice(&0x2400u16.to_le_bytes()); // MOV R4, #0
4224                bytes.extend_from_slice(&0x2500u16.to_le_bytes()); // MOV R5, #0
4225
4226                // Initialize remainder (R6:R7) = 0
4227                bytes.extend_from_slice(&0x2600u16.to_le_bytes()); // MOV R6, #0
4228                bytes.extend_from_slice(&0x2700u16.to_le_bytes()); // MOV R7, #0
4229
4230                // Initialize loop counter R12 = 64 (use R12 scratch instead of R8)
4231                // MOV.W R12, #64: F04F 0C40
4232                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4233                bytes.extend_from_slice(&0x0C40u16.to_le_bytes());
4234
4235                // Loop start
4236                let loop_start = bytes.len();
4237
4238                // === Loop body: process one bit ===
4239
4240                // 1. Shift quotient R4:R5 left by 1
4241                // LSLS R5, R5, #1 (16-bit: 0000 0010 1010 1101 = 0x006D -> actually 0x002D for LSL R5,R5,#1)
4242                // LSL Rd, Rm, #imm5: 000 00 imm5 Rm Rd = 000 00 00001 101 101 = 0x006D
4243                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4244                // Get carry from R4 into R5: ORR R5, R5, R4 LSR #31
4245                // Thumb-2 ORR with shifted register: EA45 75D4 = ORR.W R5, R5, R4, LSR #31
4246                // 11101010 010 S Rn | 0 imm3 Rd imm2 type Rm
4247                // type=01 (LSR), imm5=31 (imm3=111, imm2=11)
4248                bytes.extend_from_slice(&0xEA45u16.to_le_bytes());
4249                bytes.extend_from_slice(&0x75D4u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4250                // LSLS R4, R4, #1: 000 00 00001 100 100 = 0x0064
4251                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4252
4253                // 2. Shift remainder R6:R7 left by 1, OR in MSB of dividend R1
4254                // LSLS R7, R7, #1
4255                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4256                // ORR.W R7, R7, R6, LSR #31
4257                bytes.extend_from_slice(&0xEA47u16.to_le_bytes());
4258                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4259                // LSLS R6, R6, #1
4260                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4261                // ORR.W R6, R6, R1, LSR #31 (bring in MSB of dividend high)
4262                bytes.extend_from_slice(&0xEA46u16.to_le_bytes());
4263                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4264
4265                // 3. Shift dividend R0:R1 left by 1
4266                // LSLS R1, R1, #1
4267                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4268                // ORR.W R1, R1, R0, LSR #31
4269                bytes.extend_from_slice(&0xEA41u16.to_le_bytes());
4270                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4271                // LSLS R0, R0, #1
4272                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4273
4274                // 4. Compare remainder >= divisor (64-bit unsigned comparison)
4275                // Compare high words first: CMP R7, R3
4276                // CMP Rn, Rm encoding: 0x4280 | (Rm << 3) | Rn
4277                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3 (16-bit)
4278                // BHI means R7 > R3 (unsigned) - definitely subtract
4279                // BLO means R7 < R3 - definitely don't subtract
4280                // BEQ means need to check low words
4281
4282                // If high > divisor high: branch to subtract (forward +offset)
4283                // BHI.N +6 (skip CMP, skip BLO, do subtract)
4284                // BHI: 1101 1000 offset8 where cond=1000 (HI)
4285                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4 (to subtract block)
4286
4287                // If high < divisor high: branch past subtract
4288                // BLO.N +10 (skip to decrement)
4289                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BLO/BCC +12 (past subtract)
4290
4291                // High words equal, compare low: CMP R6, R2
4292                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2 (16-bit)
4293                // BLO/BCC past subtract (skip SUBS+SBC.W+ORR.W = 10 bytes = 4 halfwords from PC+4)
4294                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords (past subtract)
4295
4296                // === Subtract block: remainder -= divisor, quotient |= 1 ===
4297                // SUBS R6, R6, R2
4298                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2 (16-bit)
4299                // SBC R7, R7, R3 (with borrow)
4300                // Thumb-2 SBC.W: EB67 0703 = SBC.W R7, R7, R3
4301                bytes.extend_from_slice(&0xEB67u16.to_le_bytes());
4302                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4303                // ORR R4, R4, #1 (set bit 0 of quotient low)
4304                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4305                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4306
4307                // === Decrement counter and loop ===
4308                // SUBS.W R12, R12, #1 (decrement loop counter)
4309                // SUBS.W R12, R12, #1: F1BC 0C01
4310                bytes.extend_from_slice(&0xF1BCu16.to_le_bytes());
4311                bytes.extend_from_slice(&0x0C01u16.to_le_bytes());
4312
4313                // BNE back to loop_start
4314                let branch_offset_bytes = bytes.len() - loop_start + 4; // +4 for pipeline
4315                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4316                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4317                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4318
4319                // === Loop done, move quotient to R0:R1 ===
4320                bytes.extend_from_slice(&0x4620u16.to_le_bytes()); // MOV R0, R4
4321                bytes.extend_from_slice(&0x4629u16.to_le_bytes()); // MOV R1, R5
4322
4323                // POP {R4-R7} - restore scratch registers (NO PC — inline code continues)
4324                // 16-bit POP: 1011 110 P rrrrrrrr where P=0 (no PC), r=R4-R7 = 0xF0
4325                // Encoding: 1011 1100 1111 0000 = 0xBCF0
4326                bytes.extend_from_slice(&0xBCF0u16.to_le_bytes());
4327
4328                Ok(bytes)
4329            }
4330
4331            // I64DivS: 64-bit signed division
4332            // Converts to unsigned, divides, then applies sign
4333            // Input: R0:R1 = dividend (signed), R2:R3 = divisor (signed)
4334            // Output: R0:R1 = quotient (signed)
4335            ArmOp::I64DivS {
4336                rdlo: _,
4337                rdhi: _,
4338                rnlo: _,
4339                rnhi: _,
4340                rmlo: _,
4341                rmhi: _,
4342            } => {
4343                let mut bytes = Vec::new();
4344
4345                // PUSH {R4-R11} - save scratch registers (NO LR — inline code)
4346                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4347                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4348
4349                // Save result sign in R9: R9 = R1 XOR R3 (sign bit = MSB)
4350                // EOR.W R9, R1, R3
4351                bytes.extend_from_slice(&0xEA81u16.to_le_bytes());
4352                bytes.extend_from_slice(&0x0903u16.to_le_bytes());
4353
4354                // If dividend negative (R1 MSB set), negate it
4355                // TST R1, R1 (check sign)
4356                bytes.extend_from_slice(&0x4209u16.to_le_bytes()); // TST R1, R1
4357                // BPL skip_neg_dividend (+10 bytes = 5 halfwords)
4358                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4359
4360                // Negate R0:R1 (64-bit): RSBS R0, R0, #0; SBC R1, R1, R1 LSL #1
4361                // Actually: MVN R0, R0; MVN R1, R1; ADDS R0, R0, #1; ADC R1, R1, #0
4362                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4363                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4364                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4365                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4366                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4367
4368                // If divisor negative (R3 MSB set), negate it
4369                bytes.extend_from_slice(&0x421Bu16.to_le_bytes()); // TST R3, R3
4370                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4371
4372                // Negate R2:R3
4373                bytes.extend_from_slice(&0x43D2u16.to_le_bytes()); // MVNS R2, R2
4374                bytes.extend_from_slice(&0x43DBu16.to_le_bytes()); // MVNS R3, R3
4375                bytes.extend_from_slice(&0x1C52u16.to_le_bytes()); // ADDS R2, R2, #1
4376                bytes.extend_from_slice(&0xF143u16.to_le_bytes()); // ADC.W R3, R3, #0
4377                bytes.extend_from_slice(&0x0300u16.to_le_bytes());
4378
4379                // === Now do unsigned division (same as I64DivU) ===
4380                // Initialize quotient (R4:R5) = 0
4381                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4382                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4383                // Initialize remainder (R6:R7) = 0
4384                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4385                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4386                // Initialize loop counter R8 = 64
4387                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4388                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4389
4390                let loop_start = bytes.len();
4391
4392                // Shift quotient left
4393                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4394                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4395                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4396                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4397
4398                // Shift remainder left, OR in MSB of dividend
4399                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4400                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4401                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4402                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4403                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4404                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4405
4406                // Shift dividend left
4407                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4408                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4409                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4410                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4411
4412                // Compare and conditionally subtract
4413                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4414                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4415                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4416                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4417                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4418
4419                // Subtract and set quotient bit
4420                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4421                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4422                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4423                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4424                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4425
4426                // Decrement and loop
4427                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4428                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4429
4430                let branch_offset_bytes = bytes.len() - loop_start + 4;
4431                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4432                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4433                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4434
4435                // Move quotient to R0:R1
4436                bytes.extend_from_slice(&0x4620u16.to_le_bytes()); // MOV R0, R4
4437                bytes.extend_from_slice(&0x4629u16.to_le_bytes()); // MOV R1, R5
4438
4439                // If result should be negative (R9 MSB set), negate R0:R1
4440                bytes.extend_from_slice(&0xF1B9u16.to_le_bytes()); // TST.W R9, R9 (check MSB)
4441                bytes.extend_from_slice(&0x0F00u16.to_le_bytes());
4442                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8 (skip negation)
4443
4444                // Negate result R0:R1
4445                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4446                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4447                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4448                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4449                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4450
4451                // POP {R4-R11} - restore scratch registers (NO PC — inline code continues)
4452                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4453                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4454
4455                Ok(bytes)
4456            }
4457
4458            // I64RemU: 64-bit unsigned remainder using binary long division
4459            // Same algorithm as I64DivU but returns remainder instead of quotient
4460            // Input: R0:R1 = dividend, R2:R3 = divisor
4461            // Output: R0:R1 = remainder
4462            ArmOp::I64RemU {
4463                rdlo: _,
4464                rdhi: _,
4465                rnlo: _,
4466                rnhi: _,
4467                rmlo: _,
4468                rmhi: _,
4469            } => {
4470                let mut bytes = Vec::new();
4471
4472                // PUSH {R4-R8} - save scratch registers (NO LR — inline code)
4473                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4474                bytes.extend_from_slice(&0x01F0u16.to_le_bytes());
4475
4476                // Initialize quotient (R4:R5) = 0 (computed but not returned)
4477                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4478                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4479                // Initialize remainder (R6:R7) = 0
4480                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4481                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4482                // Initialize loop counter R8 = 64
4483                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4484                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4485
4486                let loop_start = bytes.len();
4487
4488                // Shift quotient left (not needed for result, but keeps algorithm same)
4489                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4490                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4491                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4492                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4493
4494                // Shift remainder left, OR in MSB of dividend
4495                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4496                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4497                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4498                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4499                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4500                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4501
4502                // Shift dividend left
4503                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4504                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4505                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4506                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4507
4508                // Compare and conditionally subtract
4509                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4510                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4511                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4512                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4513                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4514
4515                // Subtract and set quotient bit
4516                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4517                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4518                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4519                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4520                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4521
4522                // Decrement and loop
4523                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4524                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4525
4526                let branch_offset_bytes = bytes.len() - loop_start + 4;
4527                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4528                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4529                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4530
4531                // Move REMAINDER to R0:R1 (difference from I64DivU)
4532                bytes.extend_from_slice(&0x4630u16.to_le_bytes()); // MOV R0, R6
4533                bytes.extend_from_slice(&0x4639u16.to_le_bytes()); // MOV R1, R7
4534
4535                // POP {R4-R8} - restore scratch registers (NO PC — inline code continues)
4536                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4537                bytes.extend_from_slice(&0x01F0u16.to_le_bytes());
4538
4539                Ok(bytes)
4540            }
4541
4542            // I64RemS: 64-bit signed remainder
4543            // Remainder sign follows dividend sign (not quotient rule)
4544            // Input: R0:R1 = dividend (signed), R2:R3 = divisor (signed)
4545            // Output: R0:R1 = remainder (signed, same sign as dividend)
4546            ArmOp::I64RemS {
4547                rdlo: _,
4548                rdhi: _,
4549                rnlo: _,
4550                rnhi: _,
4551                rmlo: _,
4552                rmhi: _,
4553            } => {
4554                let mut bytes = Vec::new();
4555
4556                // PUSH {R4-R11} - save scratch registers (NO LR — inline code)
4557                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4558                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4559
4560                // Save dividend sign in R9 (remainder sign = dividend sign)
4561                // MOV R9, R1 (just need the sign bit)
4562                bytes.extend_from_slice(&0x4689u16.to_le_bytes()); // MOV R9, R1
4563
4564                // If dividend negative (R1 MSB set), negate it
4565                bytes.extend_from_slice(&0x4209u16.to_le_bytes()); // TST R1, R1
4566                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4567
4568                // Negate R0:R1
4569                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4570                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4571                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4572                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4573                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4574
4575                // If divisor negative (R3 MSB set), negate it
4576                bytes.extend_from_slice(&0x421Bu16.to_le_bytes()); // TST R3, R3
4577                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4578
4579                // Negate R2:R3
4580                bytes.extend_from_slice(&0x43D2u16.to_le_bytes()); // MVNS R2, R2
4581                bytes.extend_from_slice(&0x43DBu16.to_le_bytes()); // MVNS R3, R3
4582                bytes.extend_from_slice(&0x1C52u16.to_le_bytes()); // ADDS R2, R2, #1
4583                bytes.extend_from_slice(&0xF143u16.to_le_bytes()); // ADC.W R3, R3, #0
4584                bytes.extend_from_slice(&0x0300u16.to_le_bytes());
4585
4586                // === Unsigned division algorithm ===
4587                // Initialize quotient (R4:R5) = 0
4588                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4589                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4590                // Initialize remainder (R6:R7) = 0
4591                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4592                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4593                // Initialize loop counter R8 = 64
4594                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4595                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4596
4597                let loop_start = bytes.len();
4598
4599                // Shift quotient left
4600                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4601                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4602                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4603                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4604
4605                // Shift remainder left, OR in MSB of dividend
4606                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4607                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4608                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4609                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4610                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4611                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4612
4613                // Shift dividend left
4614                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4615                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4616                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4617                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4618
4619                // Compare and conditionally subtract
4620                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4621                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4622                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4623                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4624                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4625
4626                // Subtract and set quotient bit
4627                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4628                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4629                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4630                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4631                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4632
4633                // Decrement and loop
4634                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4635                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4636
4637                let branch_offset_bytes = bytes.len() - loop_start + 4;
4638                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4639                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4640                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4641
4642                // Move remainder to R0:R1
4643                bytes.extend_from_slice(&0x4630u16.to_le_bytes()); // MOV R0, R6
4644                bytes.extend_from_slice(&0x4639u16.to_le_bytes()); // MOV R1, R7
4645
4646                // If original dividend was negative (R9 MSB set), negate remainder
4647                bytes.extend_from_slice(&0xF1B9u16.to_le_bytes()); // TST.W R9, R9
4648                bytes.extend_from_slice(&0x0F00u16.to_le_bytes());
4649                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4650
4651                // Negate result R0:R1
4652                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4653                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4654                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4655                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4656                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4657
4658                // POP {R4-R11} - restore scratch registers (NO PC — inline code continues)
4659                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4660                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4661
4662                Ok(bytes)
4663            }
4664
4665            // === F32 VFP single-precision Thumb-2 encodings ===
4666            // VFP instruction words are identical to ARM32; emit as two LE halfwords.
4667            ArmOp::F32Add { sd, sn, sm } => {
4668                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE300A00, sd, sn, sm)?))
4669            }
4670            ArmOp::F32Sub { sd, sn, sm } => {
4671                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE300A40, sd, sn, sm)?))
4672            }
4673            ArmOp::F32Mul { sd, sn, sm } => {
4674                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE200A00, sd, sn, sm)?))
4675            }
4676            ArmOp::F32Div { sd, sn, sm } => {
4677                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE800A00, sd, sn, sm)?))
4678            }
4679            ArmOp::F32Abs { sd, sm } => {
4680                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB00AC0, sd, sm)?))
4681            }
4682            ArmOp::F32Neg { sd, sm } => {
4683                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB10A40, sd, sm)?))
4684            }
4685            ArmOp::F32Sqrt { sd, sm } => {
4686                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB10AC0, sd, sm)?))
4687            }
4688
4689            // f32 pseudo-ops — multi-instruction sequences
4690            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
4691            ArmOp::F32Ceil { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b01),
4692            ArmOp::F32Floor { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b10),
4693            ArmOp::F32Trunc { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b11),
4694            ArmOp::F32Nearest { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b00),
4695            ArmOp::F32Min { sd, sn, sm } => self.encode_thumb_f32_minmax(sd, sn, sm, true),
4696            ArmOp::F32Max { sd, sn, sm } => self.encode_thumb_f32_minmax(sd, sn, sm, false),
4697            ArmOp::F32Copysign { sd, sn, sm } => self.encode_thumb_f32_copysign(sd, sn, sm),
4698
4699            // f32 comparisons — VCMP + VMRS + MOV #0 + IT + MOV #1
4700            ArmOp::F32Eq { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x0),
4701            ArmOp::F32Ne { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x1),
4702            ArmOp::F32Lt { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x4),
4703            ArmOp::F32Le { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x9),
4704            ArmOp::F32Gt { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0xC),
4705            ArmOp::F32Ge { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0xA),
4706
4707            ArmOp::F32Const { sd, value } => self.encode_thumb_f32_const(sd, *value),
4708
4709            ArmOp::F32Load { sd, addr } => {
4710                Ok(vfp_to_thumb_bytes(encode_vfp_ldst(0xED900A00, sd, addr)?))
4711            }
4712            ArmOp::F32Store { sd, addr } => {
4713                Ok(vfp_to_thumb_bytes(encode_vfp_ldst(0xED800A00, sd, addr)?))
4714            }
4715
4716            ArmOp::F32ConvertI32S { sd, rm } => self.encode_thumb_f32_convert_i32(sd, rm, true),
4717            ArmOp::F32ConvertI32U { sd, rm } => self.encode_thumb_f32_convert_i32(sd, rm, false),
4718            ArmOp::F32ConvertI64S { .. } | ArmOp::F32ConvertI64U { .. } => {
4719                Err(synth_core::Error::synthesis(
4720                    "F32 i64 conversion not supported (requires register pairs on 32-bit ARM)",
4721                ))
4722            }
4723            ArmOp::F32ReinterpretI32 { sd, rm } => {
4724                Ok(vfp_to_thumb_bytes(encode_vmov_core_sreg(true, sd, rm)?))
4725            }
4726            ArmOp::I32ReinterpretF32 { rd, sm } => {
4727                Ok(vfp_to_thumb_bytes(encode_vmov_core_sreg(false, sm, rd)?))
4728            }
4729            ArmOp::I32TruncF32S { rd, sm } => self.encode_thumb_i32_trunc_f32(rd, sm, true),
4730            ArmOp::I32TruncF32U { rd, sm } => self.encode_thumb_i32_trunc_f32(rd, sm, false),
4731
4732            // === F64 VFP double-precision Thumb-2 encodings ===
4733            // VFP instruction words are identical to ARM32; emit as two LE halfwords.
4734            ArmOp::F64Add { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4735                0xEE300B00, dd, dn, dm,
4736            )?)),
4737            ArmOp::F64Sub { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4738                0xEE300B40, dd, dn, dm,
4739            )?)),
4740            ArmOp::F64Mul { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4741                0xEE200B00, dd, dn, dm,
4742            )?)),
4743            ArmOp::F64Div { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4744                0xEE800B00, dd, dn, dm,
4745            )?)),
4746            ArmOp::F64Abs { dd, dm } => {
4747                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB00BC0, dd, dm)?))
4748            }
4749            ArmOp::F64Neg { dd, dm } => {
4750                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB10B40, dd, dm)?))
4751            }
4752            ArmOp::F64Sqrt { dd, dm } => {
4753                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB10BC0, dd, dm)?))
4754            }
4755
4756            // f64 pseudo-ops
4757            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
4758            ArmOp::F64Ceil { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b01),
4759            ArmOp::F64Floor { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b10),
4760            ArmOp::F64Trunc { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b11),
4761            ArmOp::F64Nearest { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b00),
4762            ArmOp::F64Min { dd, dn, dm } => self.encode_thumb_f64_minmax(dd, dn, dm, true),
4763            ArmOp::F64Max { dd, dn, dm } => self.encode_thumb_f64_minmax(dd, dn, dm, false),
4764            ArmOp::F64Copysign { dd, dn, dm } => self.encode_thumb_f64_copysign(dd, dn, dm),
4765
4766            // f64 comparisons
4767            ArmOp::F64Eq { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x0),
4768            ArmOp::F64Ne { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x1),
4769            ArmOp::F64Lt { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x4),
4770            ArmOp::F64Le { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x9),
4771            ArmOp::F64Gt { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0xC),
4772            ArmOp::F64Ge { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0xA),
4773
4774            ArmOp::F64Const { dd, value } => self.encode_thumb_f64_const(dd, *value),
4775
4776            ArmOp::F64Load { dd, addr } => Ok(vfp_to_thumb_bytes(encode_vfp_ldst_f64(
4777                0xED900B00, dd, addr,
4778            )?)),
4779            ArmOp::F64Store { dd, addr } => Ok(vfp_to_thumb_bytes(encode_vfp_ldst_f64(
4780                0xED800B00, dd, addr,
4781            )?)),
4782
4783            ArmOp::F64ConvertI32S { dd, rm } => self.encode_thumb_f64_convert_i32(dd, rm, true),
4784            ArmOp::F64ConvertI32U { dd, rm } => self.encode_thumb_f64_convert_i32(dd, rm, false),
4785            ArmOp::F64ConvertI64S { .. } | ArmOp::F64ConvertI64U { .. } => {
4786                Err(synth_core::Error::synthesis(
4787                    "F64 i64 conversion not supported (requires register pairs on 32-bit ARM)",
4788                ))
4789            }
4790            ArmOp::F64PromoteF32 { dd, sm } => self.encode_thumb_f64_promote_f32(dd, sm),
4791            ArmOp::F64ReinterpretI64 { dd, rmlo, rmhi } => Ok(vfp_to_thumb_bytes(
4792                encode_vmov_core_dreg(true, dd, rmlo, rmhi)?,
4793            )),
4794            ArmOp::I64ReinterpretF64 { rdlo, rdhi, dm } => Ok(vfp_to_thumb_bytes(
4795                encode_vmov_core_dreg(false, dm, rdlo, rdhi)?,
4796            )),
4797            ArmOp::I64TruncF64S { .. } | ArmOp::I64TruncF64U { .. } => {
4798                Err(synth_core::Error::synthesis(
4799                    "i64 truncation from F64 not supported (requires i64 register pairs on 32-bit ARM)",
4800                ))
4801            }
4802            ArmOp::I32TruncF64S { rd, dm } => self.encode_thumb_i32_trunc_f64(rd, dm, true),
4803            ArmOp::I32TruncF64U { rd, dm } => self.encode_thumb_i32_trunc_f64(rd, dm, false),
4804
4805            // ===== i64 operations: encode as multi-instruction Thumb-2 sequences =====
4806
4807            // I64Add: ADDS rdlo, rnlo, rmlo; ADC.W rdhi, rnhi, rmhi
4808            ArmOp::I64Add {
4809                rdlo,
4810                rdhi,
4811                rnlo,
4812                rnhi,
4813                rmlo,
4814                rmhi,
4815            } => {
4816                let mut bytes = Vec::new();
4817                // ADDS rdlo, rnlo, rmlo (16-bit)
4818                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Adds {
4819                    rd: *rdlo,
4820                    rn: *rnlo,
4821                    op2: Operand2::Reg(*rmlo),
4822                })?);
4823                // ADC.W rdhi, rnhi, rmhi (32-bit)
4824                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Adc {
4825                    rd: *rdhi,
4826                    rn: *rnhi,
4827                    op2: Operand2::Reg(*rmhi),
4828                })?);
4829                Ok(bytes)
4830            }
4831
4832            // I64Sub: SUBS rdlo, rnlo, rmlo; SBC.W rdhi, rnhi, rmhi
4833            ArmOp::I64Sub {
4834                rdlo,
4835                rdhi,
4836                rnlo,
4837                rnhi,
4838                rmlo,
4839                rmhi,
4840            } => {
4841                let mut bytes = Vec::new();
4842                // SUBS rdlo, rnlo, rmlo (16-bit)
4843                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Subs {
4844                    rd: *rdlo,
4845                    rn: *rnlo,
4846                    op2: Operand2::Reg(*rmlo),
4847                })?);
4848                // SBC.W rdhi, rnhi, rmhi (32-bit)
4849                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Sbc {
4850                    rd: *rdhi,
4851                    rn: *rnhi,
4852                    op2: Operand2::Reg(*rmhi),
4853                })?);
4854                Ok(bytes)
4855            }
4856
4857            // I64And: AND rdlo, rnlo, rmlo; AND rdhi, rnhi, rmhi
4858            ArmOp::I64And {
4859                rdlo,
4860                rdhi,
4861                rnlo,
4862                rnhi,
4863                rmlo,
4864                rmhi,
4865            } => {
4866                let mut bytes = Vec::new();
4867                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::And {
4868                    rd: *rdlo,
4869                    rn: *rnlo,
4870                    op2: Operand2::Reg(*rmlo),
4871                })?);
4872                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::And {
4873                    rd: *rdhi,
4874                    rn: *rnhi,
4875                    op2: Operand2::Reg(*rmhi),
4876                })?);
4877                Ok(bytes)
4878            }
4879
4880            // I64Or: ORR rdlo, rnlo, rmlo; ORR rdhi, rnhi, rmhi
4881            ArmOp::I64Or {
4882                rdlo,
4883                rdhi,
4884                rnlo,
4885                rnhi,
4886                rmlo,
4887                rmhi,
4888            } => {
4889                let mut bytes = Vec::new();
4890                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Orr {
4891                    rd: *rdlo,
4892                    rn: *rnlo,
4893                    op2: Operand2::Reg(*rmlo),
4894                })?);
4895                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Orr {
4896                    rd: *rdhi,
4897                    rn: *rnhi,
4898                    op2: Operand2::Reg(*rmhi),
4899                })?);
4900                Ok(bytes)
4901            }
4902
4903            // I64Xor: EOR rdlo, rnlo, rmlo; EOR rdhi, rnhi, rmhi
4904            ArmOp::I64Xor {
4905                rdlo,
4906                rdhi,
4907                rnlo,
4908                rnhi,
4909                rmlo,
4910                rmhi,
4911            } => {
4912                let mut bytes = Vec::new();
4913                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Eor {
4914                    rd: *rdlo,
4915                    rn: *rnlo,
4916                    op2: Operand2::Reg(*rmlo),
4917                })?);
4918                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Eor {
4919                    rd: *rdhi,
4920                    rn: *rnhi,
4921                    op2: Operand2::Reg(*rmhi),
4922                })?);
4923                Ok(bytes)
4924            }
4925
4926            // I64Eqz: ORR scratch, lo, hi; ITE EQ; MOV rd, #1; MOV rd, #0
4927            ArmOp::I64Eqz { rd, rnlo, rnhi } => self.encode_thumb(&ArmOp::I64SetCondZ {
4928                rd: *rd,
4929                rn_lo: *rnlo,
4930                rn_hi: *rnhi,
4931            }),
4932
4933            // I64 comparisons: delegate to I64SetCond
4934            ArmOp::I64Eq {
4935                rd,
4936                rnlo,
4937                rnhi,
4938                rmlo,
4939                rmhi,
4940            } => self.encode_thumb(&ArmOp::I64SetCond {
4941                rd: *rd,
4942                rn_lo: *rnlo,
4943                rn_hi: *rnhi,
4944                rm_lo: *rmlo,
4945                rm_hi: *rmhi,
4946                cond: synth_synthesis::Condition::EQ,
4947            }),
4948
4949            ArmOp::I64Ne {
4950                rd,
4951                rnlo,
4952                rnhi,
4953                rmlo,
4954                rmhi,
4955            } => self.encode_thumb(&ArmOp::I64SetCond {
4956                rd: *rd,
4957                rn_lo: *rnlo,
4958                rn_hi: *rnhi,
4959                rm_lo: *rmlo,
4960                rm_hi: *rmhi,
4961                cond: synth_synthesis::Condition::NE,
4962            }),
4963
4964            ArmOp::I64LtS {
4965                rd,
4966                rnlo,
4967                rnhi,
4968                rmlo,
4969                rmhi,
4970            } => self.encode_thumb(&ArmOp::I64SetCond {
4971                rd: *rd,
4972                rn_lo: *rnlo,
4973                rn_hi: *rnhi,
4974                rm_lo: *rmlo,
4975                rm_hi: *rmhi,
4976                cond: synth_synthesis::Condition::LT,
4977            }),
4978
4979            ArmOp::I64LtU {
4980                rd,
4981                rnlo,
4982                rnhi,
4983                rmlo,
4984                rmhi,
4985            } => self.encode_thumb(&ArmOp::I64SetCond {
4986                rd: *rd,
4987                rn_lo: *rnlo,
4988                rn_hi: *rnhi,
4989                rm_lo: *rmlo,
4990                rm_hi: *rmhi,
4991                cond: synth_synthesis::Condition::LO,
4992            }),
4993
4994            ArmOp::I64LeS {
4995                rd,
4996                rnlo,
4997                rnhi,
4998                rmlo,
4999                rmhi,
5000            } => self.encode_thumb(&ArmOp::I64SetCond {
5001                rd: *rd,
5002                rn_lo: *rnlo,
5003                rn_hi: *rnhi,
5004                rm_lo: *rmlo,
5005                rm_hi: *rmhi,
5006                cond: synth_synthesis::Condition::LE,
5007            }),
5008
5009            ArmOp::I64LeU {
5010                rd,
5011                rnlo,
5012                rnhi,
5013                rmlo,
5014                rmhi,
5015            } => self.encode_thumb(&ArmOp::I64SetCond {
5016                rd: *rd,
5017                rn_lo: *rnlo,
5018                rn_hi: *rnhi,
5019                rm_lo: *rmlo,
5020                rm_hi: *rmhi,
5021                cond: synth_synthesis::Condition::LS,
5022            }),
5023
5024            ArmOp::I64GtS {
5025                rd,
5026                rnlo,
5027                rnhi,
5028                rmlo,
5029                rmhi,
5030            } => self.encode_thumb(&ArmOp::I64SetCond {
5031                rd: *rd,
5032                rn_lo: *rnlo,
5033                rn_hi: *rnhi,
5034                rm_lo: *rmlo,
5035                rm_hi: *rmhi,
5036                cond: synth_synthesis::Condition::GT,
5037            }),
5038
5039            ArmOp::I64GtU {
5040                rd,
5041                rnlo,
5042                rnhi,
5043                rmlo,
5044                rmhi,
5045            } => self.encode_thumb(&ArmOp::I64SetCond {
5046                rd: *rd,
5047                rn_lo: *rnlo,
5048                rn_hi: *rnhi,
5049                rm_lo: *rmlo,
5050                rm_hi: *rmhi,
5051                cond: synth_synthesis::Condition::HI,
5052            }),
5053
5054            ArmOp::I64GeS {
5055                rd,
5056                rnlo,
5057                rnhi,
5058                rmlo,
5059                rmhi,
5060            } => self.encode_thumb(&ArmOp::I64SetCond {
5061                rd: *rd,
5062                rn_lo: *rnlo,
5063                rn_hi: *rnhi,
5064                rm_lo: *rmlo,
5065                rm_hi: *rmhi,
5066                cond: synth_synthesis::Condition::GE,
5067            }),
5068
5069            ArmOp::I64GeU {
5070                rd,
5071                rnlo,
5072                rnhi,
5073                rmlo,
5074                rmhi,
5075            } => self.encode_thumb(&ArmOp::I64SetCond {
5076                rd: *rd,
5077                rn_lo: *rnlo,
5078                rn_hi: *rnhi,
5079                rm_lo: *rmlo,
5080                rm_hi: *rmhi,
5081                cond: synth_synthesis::Condition::HS,
5082            }),
5083
5084            // I64Const: MOVW rdlo, lo16; MOVT rdlo, hi16; MOVW rdhi, lo16_hi; MOVT rdhi, hi16_hi
5085            ArmOp::I64Const { rdlo, rdhi, value } => {
5086                let lo32 = *value as u32;
5087                let hi32 = (*value >> 32) as u32;
5088                let mut bytes = Vec::new();
5089                // Load low 32 bits into rdlo
5090                bytes.extend_from_slice(
5091                    &self.encode_thumb32_movw_raw(reg_to_bits(rdlo), lo32 & 0xFFFF)?,
5092                );
5093                if lo32 > 0xFFFF {
5094                    bytes.extend_from_slice(
5095                        &self.encode_thumb32_movt_raw(reg_to_bits(rdlo), lo32 >> 16)?,
5096                    );
5097                }
5098                // Load high 32 bits into rdhi
5099                bytes.extend_from_slice(
5100                    &self.encode_thumb32_movw_raw(reg_to_bits(rdhi), hi32 & 0xFFFF)?,
5101                );
5102                if hi32 > 0xFFFF {
5103                    bytes.extend_from_slice(
5104                        &self.encode_thumb32_movt_raw(reg_to_bits(rdhi), hi32 >> 16)?,
5105                    );
5106                }
5107                Ok(bytes)
5108            }
5109
5110            // I64Ldr: LDR rdlo, [base, offset]; LDR rdhi, [base, offset+4]
5111            ArmOp::I64Ldr { rdlo, rdhi, addr } => {
5112                let mut bytes = Vec::new();
5113                let offset = if addr.offset < 0 {
5114                    0u32
5115                } else {
5116                    addr.offset as u32
5117                };
5118                bytes.extend_from_slice(&self.encode_thumb32_ldr(rdlo, &addr.base, offset)?);
5119                bytes.extend_from_slice(&self.encode_thumb32_ldr(
5120                    rdhi,
5121                    &addr.base,
5122                    offset.wrapping_add(4),
5123                )?);
5124                Ok(bytes)
5125            }
5126
5127            // I64Str: STR rdlo, [base, offset]; STR rdhi, [base, offset+4]
5128            ArmOp::I64Str { rdlo, rdhi, addr } => {
5129                let mut bytes = Vec::new();
5130                let offset = if addr.offset < 0 {
5131                    0u32
5132                } else {
5133                    addr.offset as u32
5134                };
5135                bytes.extend_from_slice(&self.encode_thumb32_str(rdlo, &addr.base, offset)?);
5136                bytes.extend_from_slice(&self.encode_thumb32_str(
5137                    rdhi,
5138                    &addr.base,
5139                    offset.wrapping_add(4),
5140                )?);
5141                Ok(bytes)
5142            }
5143
5144            // I64ExtendI32S: MOV rdlo, rn; ASR rdhi, rdlo, #31 (sign-extend)
5145            ArmOp::I64ExtendI32S { rdlo, rdhi, rn } => {
5146                let mut bytes = Vec::new();
5147                if rdlo != rn {
5148                    // MOV rdlo, rn (16-bit)
5149                    bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Mov {
5150                        rd: *rdlo,
5151                        op2: Operand2::Reg(*rn),
5152                    })?);
5153                }
5154                // ASR rdhi, rdlo, #31 (sign-extend: fill high word with sign bit)
5155                bytes.extend_from_slice(
5156                    &self.encode_thumb32_shift(rdhi, rdlo, 31, 0b10)?, // ASR type
5157                );
5158                Ok(bytes)
5159            }
5160
5161            // I64ExtendI32U: MOV rdlo, rn; MOV rdhi, #0
5162            ArmOp::I64ExtendI32U { rdlo, rdhi, rn } => {
5163                let mut bytes = Vec::new();
5164                if rdlo != rn {
5165                    // MOV rdlo, rn
5166                    bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Mov {
5167                        rd: *rdlo,
5168                        op2: Operand2::Reg(*rn),
5169                    })?);
5170                }
5171                // MOV rdhi, #0 (16-bit: MOVS Rd, #0)
5172                let rdhi_bits = reg_to_bits(rdhi) as u16;
5173                let instr: u16 = 0x2000 | (rdhi_bits << 8);
5174                bytes.extend_from_slice(&instr.to_le_bytes());
5175                Ok(bytes)
5176            }
5177
5178            // I32WrapI64: MOV rd, rnlo (just take low 32 bits)
5179            ArmOp::I32WrapI64 { rd, rnlo } => {
5180                if rd == rnlo {
5181                    // No-op: already in the right register
5182                    let instr: u16 = 0xBF00; // NOP
5183                    Ok(instr.to_le_bytes().to_vec())
5184                } else {
5185                    // MOV rd, rnlo
5186                    self.encode_thumb(&ArmOp::Mov {
5187                        rd: *rd,
5188                        op2: Operand2::Reg(*rnlo),
5189                    })
5190                }
5191            }
5192
5193            // ===== Helium MVE operations (Thumb-2 encoding) =====
5194            ArmOp::MveLoad { qd, addr } => Ok(vfp_to_thumb_bytes(encode_mve_vldrw(qd, addr))),
5195            ArmOp::MveStore { qd, addr } => Ok(vfp_to_thumb_bytes(encode_mve_vstrw(qd, addr))),
5196            ArmOp::MveConst { qd, bytes } => self.encode_thumb_mve_const(qd, bytes),
5197            ArmOp::MveAnd { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5198                0xEF000150, qd, qn, qm,
5199            ))),
5200            ArmOp::MveOrr { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5201                0xEF200150, qd, qn, qm,
5202            ))),
5203            ArmOp::MveEor { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5204                0xFF000150, qd, qn, qm,
5205            ))),
5206            ArmOp::MveMvn { qd, qm } => {
5207                // VMVN Qd, Qm: 0xFFB005C0 | Qd<<12 | Qm
5208                let qd_enc = qreg_to_num(qd);
5209                let qm_enc = qreg_to_num(qm);
5210                let instr: u32 = 0xFFB005C0 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5211                Ok(vfp_to_thumb_bytes(instr))
5212            }
5213            ArmOp::MveBic { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5214                0xEF100150, qd, qn, qm,
5215            ))),
5216            ArmOp::MveAddI { qd, qn, qm, size } => {
5217                let sz = mve_size_bits(size);
5218                let base: u32 = 0xEF000840 | (sz << 20);
5219                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5220            }
5221            ArmOp::MveSubI { qd, qn, qm, size } => {
5222                let sz = mve_size_bits(size);
5223                let base: u32 = 0xFF000840 | (sz << 20);
5224                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5225            }
5226            ArmOp::MveMulI { qd, qn, qm, size } => {
5227                let sz = mve_size_bits(size);
5228                let base: u32 = 0xEF000950 | (sz << 20);
5229                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5230            }
5231            ArmOp::MveNegI { qd, qm, size } => {
5232                let sz = mve_size_bits(size);
5233                // VNEG.Sx Qd, Qm
5234                let qd_enc = qreg_to_num(qd);
5235                let qm_enc = qreg_to_num(qm);
5236                let base: u32 = 0xFFB103C0 | (sz << 18);
5237                let instr = base | ((qd_enc * 2) << 12) | (qm_enc * 2);
5238                Ok(vfp_to_thumb_bytes(instr))
5239            }
5240            ArmOp::MveDup { qd, rn, size } => {
5241                let sz = mve_size_bits(size);
5242                let qd_enc = qreg_to_num(qd);
5243                let rn_bits = reg_to_bits(rn);
5244                // VDUP.sz Qd, Rn: EEA0 0B10 variant
5245                // size encoding: 00=32, 01=16, 10=8
5246                let be = match sz {
5247                    0 => 0b00u32, // 8-bit
5248                    1 => 0b01,    // 16-bit
5249                    _ => 0b00,    // 32-bit (default)
5250                };
5251                let instr: u32 = 0xEEA00B10 | ((qd_enc * 2) << 16) | (rn_bits << 12) | (be << 5);
5252                Ok(vfp_to_thumb_bytes(instr))
5253            }
5254            ArmOp::MveExtractLane { rd, qn, lane, size } => {
5255                let qn_enc = qreg_to_num(qn);
5256                let rd_bits = reg_to_bits(rd);
5257                // VMOV.sz Rd, Dn[x] — extract from Q-register lane
5258                // For 32-bit: VMOV Rd, Dn — where Dn is the appropriate D-register
5259                let d_reg = qn_enc * 2 + ((*lane as u32) >> 1);
5260                let lane_in_d = (*lane as u32) & 1;
5261                let _sz = mve_size_bits(size);
5262                // VMOV Rd, Dn[x]: EE10 0B10 for 32-bit
5263                let instr: u32 = 0xEE100B10 | (d_reg << 16) | (rd_bits << 12) | (lane_in_d << 21);
5264                Ok(vfp_to_thumb_bytes(instr))
5265            }
5266            ArmOp::MveInsertLane { qd, rn, lane, size } => {
5267                let qd_enc = qreg_to_num(qd);
5268                let rn_bits = reg_to_bits(rn);
5269                let d_reg = qd_enc * 2 + ((*lane as u32) >> 1);
5270                let lane_in_d = (*lane as u32) & 1;
5271                let _sz = mve_size_bits(size);
5272                // VMOV Dn[x], Rn: EE00 0B10 for 32-bit
5273                let instr: u32 = 0xEE000B10 | (d_reg << 16) | (rn_bits << 12) | (lane_in_d << 21);
5274                Ok(vfp_to_thumb_bytes(instr))
5275            }
5276
5277            // MVE float comparisons — emit VCMP + VPSEL sequence (simplified: just VCMP)
5278            ArmOp::MveCmpEqI { qd, qn, qm, size }
5279            | ArmOp::MveCmpNeI { qd, qn, qm, size }
5280            | ArmOp::MveCmpLtS { qd, qn, qm, size }
5281            | ArmOp::MveCmpLtU { qd, qn, qm, size }
5282            | ArmOp::MveCmpGtS { qd, qn, qm, size }
5283            | ArmOp::MveCmpGtU { qd, qn, qm, size }
5284            | ArmOp::MveCmpLeS { qd, qn, qm, size }
5285            | ArmOp::MveCmpLeU { qd, qn, qm, size }
5286            | ArmOp::MveCmpGeS { qd, qn, qm, size }
5287            | ArmOp::MveCmpGeU { qd, qn, qm, size } => {
5288                // Encode as VADD (placeholder encoding — real implementation
5289                // would use VCMP + VPSEL pair)
5290                let sz = mve_size_bits(size);
5291                let base: u32 = 0xEF000840 | (sz << 20);
5292                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5293            }
5294
5295            // f32x4 MVE arithmetic
5296            ArmOp::MveAddF32 { qd, qn, qm } => {
5297                // VADD.F32 Qd, Qn, Qm (MVE): 0xEF000D40
5298                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF000D40, qd, qn, qm)))
5299            }
5300            ArmOp::MveSubF32 { qd, qn, qm } => {
5301                // VSUB.F32 Qd, Qn, Qm (MVE): 0xEF200D40
5302                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF200D40, qd, qn, qm)))
5303            }
5304            ArmOp::MveMulF32 { qd, qn, qm } => {
5305                // VMUL.F32 Qd, Qn, Qm (MVE): 0xFF000D50
5306                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xFF000D50, qd, qn, qm)))
5307            }
5308            ArmOp::MveNegF32 { qd, qm } => {
5309                let qd_enc = qreg_to_num(qd);
5310                let qm_enc = qreg_to_num(qm);
5311                // VNEG.F32 Qd, Qm: FFB907C0
5312                let instr: u32 = 0xFFB907C0 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5313                Ok(vfp_to_thumb_bytes(instr))
5314            }
5315            ArmOp::MveAbsF32 { qd, qm } => {
5316                let qd_enc = qreg_to_num(qd);
5317                let qm_enc = qreg_to_num(qm);
5318                // VABS.F32 Qd, Qm: FFB90740
5319                let instr: u32 = 0xFFB90740 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5320                Ok(vfp_to_thumb_bytes(instr))
5321            }
5322            ArmOp::MveCmpEqF32 { qd, qn, qm }
5323            | ArmOp::MveCmpNeF32 { qd, qn, qm }
5324            | ArmOp::MveCmpLtF32 { qd, qn, qm }
5325            | ArmOp::MveCmpLeF32 { qd, qn, qm }
5326            | ArmOp::MveCmpGtF32 { qd, qn, qm }
5327            | ArmOp::MveCmpGeF32 { qd, qn, qm } => {
5328                // Placeholder: encode as VADD.F32 (real impl needs VCMP.F32 + VPSEL)
5329                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF000D40, qd, qn, qm)))
5330            }
5331            ArmOp::MveDupF32 { qd, rn } => {
5332                let qd_enc = qreg_to_num(qd);
5333                let rn_bits = reg_to_bits(rn);
5334                // VDUP.32 Qd, Rn (same encoding as integer VDUP.32)
5335                let instr: u32 = 0xEEA00B10 | ((qd_enc * 2) << 16) | (rn_bits << 12);
5336                Ok(vfp_to_thumb_bytes(instr))
5337            }
5338            ArmOp::MveExtractLaneF32 { rd, qn, lane } => {
5339                let qn_enc = qreg_to_num(qn);
5340                let rd_bits = reg_to_bits(rd);
5341                // VMOV Rd, Sn where Sn = Q*4 + lane
5342                let s_num = qn_enc * 4 + (*lane as u32);
5343                let (vn, n) = encode_sreg(s_num);
5344                let instr: u32 = 0xEE100A10 | (vn << 16) | (rd_bits << 12) | (n << 7);
5345                Ok(vfp_to_thumb_bytes(instr))
5346            }
5347            ArmOp::MveReplaceLaneF32 { qd, rn, lane } => {
5348                let qd_enc = qreg_to_num(qd);
5349                let rn_bits = reg_to_bits(rn);
5350                // VMOV Sn, Rn where Sn = Q*4 + lane
5351                let s_num = qd_enc * 4 + (*lane as u32);
5352                let (vn, n) = encode_sreg(s_num);
5353                let instr: u32 = 0xEE000A10 | (vn << 16) | (rn_bits << 12) | (n << 7);
5354                Ok(vfp_to_thumb_bytes(instr))
5355            }
5356            ArmOp::MveDivF32 { qd, qn, qm } => {
5357                // Lane-wise: extract 4 S-regs, VDIV, insert back
5358                self.encode_thumb_mve_lane_wise_f32_binop(qd, qn, qm, 0xEE800A00)
5359            }
5360            ArmOp::MveSqrtF32 { qd, qm } => {
5361                // Lane-wise: extract 4 S-regs, VSQRT, insert back
5362                self.encode_thumb_mve_lane_wise_f32_sqrt(qd, qm)
5363            }
5364
5365            // Catch-all for any remaining ops
5366            _ => {
5367                let instr: u16 = 0xBF00; // NOP
5368                Ok(instr.to_le_bytes().to_vec())
5369            }
5370        }
5371    }
5372
5373    // === Thumb-2 VFP multi-instruction helpers ===
5374
5375    /// Encode F32 comparison as Thumb-2: VCMP.F32 + VMRS + MOVS rd,#0 + IT + MOV rd,#1
5376    fn encode_thumb_f32_compare(
5377        &self,
5378        rd: &Reg,
5379        sn: &VfpReg,
5380        sm: &VfpReg,
5381        cond_code: u32,
5382    ) -> Result<Vec<u8>> {
5383        let mut bytes = Vec::new();
5384        let rd_bits = reg_to_bits(rd);
5385
5386        // VCMP.F32 Sn, Sm
5387        let sn_num = vfp_sreg_to_num(sn)?;
5388        let sm_num = vfp_sreg_to_num(sm)?;
5389        let (vd, d) = encode_sreg(sn_num);
5390        let (vm, m) = encode_sreg(sm_num);
5391        let vcmp = 0xEEB40A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5392        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5393
5394        // VMRS APSR_nzcv, FPSCR: 0xEEF1FA10
5395        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5396
5397        // MOVS Rd, #0 (16-bit): 0010 0 Rd(3) 0000 0000
5398        if rd_bits < 8 {
5399            let movs_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
5400            bytes.extend_from_slice(&movs_zero.to_le_bytes());
5401        } else {
5402            // MOV.W Rd, #0 (32-bit Thumb-2)
5403            let hw1: u16 = 0xF04F;
5404            let hw2: u16 = (rd_bits as u16) << 8;
5405            bytes.extend_from_slice(&hw1.to_le_bytes());
5406            bytes.extend_from_slice(&hw2.to_le_bytes());
5407        }
5408
5409        // IT<cond> — If-Then for conditional MOV
5410        // IT encoding: 1011 1111 cond(4) mask(4)
5411        // mask = 0x8 for single "then" (IT)
5412        let it: u16 = 0xBF00 | ((cond_code as u16) << 4) | 0x8;
5413        bytes.extend_from_slice(&it.to_le_bytes());
5414
5415        // MOV Rd, #1 (16-bit, conditional due to IT): 0010 0 Rd(3) 0000 0001
5416        if rd_bits < 8 {
5417            let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
5418            bytes.extend_from_slice(&mov_one.to_le_bytes());
5419        } else {
5420            // MOV.W Rd, #1 (32-bit)
5421            let hw1: u16 = 0xF04F;
5422            let hw2: u16 = ((rd_bits as u16) << 8) | 0x01;
5423            bytes.extend_from_slice(&hw1.to_le_bytes());
5424            bytes.extend_from_slice(&hw2.to_le_bytes());
5425        }
5426
5427        Ok(bytes)
5428    }
5429
5430    /// Encode F32 constant load as Thumb-2: MOVW + MOVT + VMOV
5431    fn encode_thumb_f32_const(&self, sd: &VfpReg, value: f32) -> Result<Vec<u8>> {
5432        let mut bytes = Vec::new();
5433        let bits = value.to_bits();
5434        let rt: u32 = 12; // R12/IP as temp
5435
5436        // MOVW R12, #lo16
5437        // Thumb-2 MOVW: 11110 i 10 0100 imm4 | 0 imm3 Rd imm8
5438        let lo16 = bits & 0xFFFF;
5439        let imm4 = (lo16 >> 12) & 0xF;
5440        let i_bit = (lo16 >> 11) & 1;
5441        let imm3 = (lo16 >> 8) & 0x7;
5442        let imm8 = lo16 & 0xFF;
5443        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
5444        let hw2: u16 = ((imm3 << 12) | (rt << 8) | imm8) as u16;
5445        bytes.extend_from_slice(&hw1.to_le_bytes());
5446        bytes.extend_from_slice(&hw2.to_le_bytes());
5447
5448        // MOVT R12, #hi16
5449        let hi16 = (bits >> 16) & 0xFFFF;
5450        let imm4 = (hi16 >> 12) & 0xF;
5451        let i_bit = (hi16 >> 11) & 1;
5452        let imm3 = (hi16 >> 8) & 0x7;
5453        let imm8 = hi16 & 0xFF;
5454        let hw1: u16 = (0xF2C0 | (i_bit << 10) | imm4) as u16;
5455        let hw2: u16 = ((imm3 << 12) | (rt << 8) | imm8) as u16;
5456        bytes.extend_from_slice(&hw1.to_le_bytes());
5457        bytes.extend_from_slice(&hw2.to_le_bytes());
5458
5459        // VMOV Sd, R12
5460        let vmov = encode_vmov_core_sreg(true, sd, &Reg::R12)?;
5461        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5462
5463        Ok(bytes)
5464    }
5465
5466    /// Encode VMOV + VCVT.F32.xS32 as Thumb-2
5467    fn encode_thumb_f32_convert_i32(&self, sd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
5468        let mut bytes = Vec::new();
5469
5470        // VMOV Sd, Rm
5471        let vmov = encode_vmov_core_sreg(true, sd, rm)?;
5472        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5473
5474        // VCVT.F32.S32/U32 Sd, Sd
5475        let sd_num = vfp_sreg_to_num(sd)?;
5476        let (vd, d) = encode_sreg(sd_num);
5477        let (vm, m) = encode_sreg(sd_num);
5478        let base = if signed { 0xEEB80A40 } else { 0xEEB80AC0 };
5479        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
5480        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5481
5482        Ok(bytes)
5483    }
5484
5485    /// Encode F32 rounding pseudo-op as Thumb-2 via VCVT to integer and back
5486    /// Encode F32 rounding as Thumb-2.
5487    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
5488    ///
5489    /// For trunc: uses VCVTR.S32.F32 (always truncates).
5490    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F32 (non-R variant),
5491    /// then restores FPSCR.
5492    fn encode_thumb_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
5493        let mut bytes = Vec::new();
5494        let sm_num = vfp_sreg_to_num(sm)?;
5495        let sd_num = vfp_sreg_to_num(sd)?;
5496        let (vd_s, d_s) = encode_sreg(sd_num);
5497        let (vm_s, m_s) = encode_sreg(sm_num);
5498
5499        if mode == 0b11 {
5500            // Trunc (toward zero): VCVTR.S32.F32 — bit[7]=1, always truncates
5501            let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
5502            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5503        } else {
5504            // ceil/floor/nearest: manipulate FPSCR rounding mode
5505            let rt: u32 = 12; // R12/IP as temp
5506
5507            // VMRS R12, FPSCR
5508            let vmrs = 0xEEF10A10 | (rt << 12);
5509            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5510
5511            // BIC.W R12, R12, #(3 << 22) — clear RMode bits [23:22]
5512            // Thumb-2 modified immediate for 3<<22 = 0x00C00000:
5513            // BIC.W encoding: 11110 i 0 0001 S Rn | 0 imm3 Rd imm8
5514            // 0x00C00000 = 0x03 shifted left by 22 => Thumb mod-imm: i=0, imm3=0b101, imm8=0x03
5515            let bic_hw1: u16 = 0xF020 | ((rt as u16) & 0xF); // BIC, Rn=R12
5516            let bic_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | 0x03;
5517            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5518            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5519
5520            // ORR.W R12, R12, #(mode << 22)
5521            if mode != 0 {
5522                let orr_hw1: u16 = 0xF040 | ((rt as u16) & 0xF); // ORR, Rn=R12
5523                let orr_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | (mode as u16);
5524                bytes.extend_from_slice(&orr_hw1.to_le_bytes());
5525                bytes.extend_from_slice(&orr_hw2.to_le_bytes());
5526            }
5527
5528            // VMSR FPSCR, R12
5529            let vmsr = 0xEEE10A10 | (rt << 12);
5530            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5531
5532            // VCVT.S32.F32 Sd, Sm — non-R variant (bit[7]=0), uses FPSCR rmode
5533            let vcvt_to_int = 0xEEBD0A40 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
5534            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5535
5536            // Restore FPSCR: clear rmode bits back to nearest (default)
5537            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5538            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5539            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5540            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5541        }
5542
5543        // VCVT.F32.S32 Sd, Sd (convert integer result back to float)
5544        let (vd2, d2) = encode_sreg(sd_num);
5545        let vcvt_to_float = 0xEEB80A40 | (d2 << 22) | (vd2 << 12) | (d_s << 5) | vd_s;
5546        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_float));
5547
5548        Ok(bytes)
5549    }
5550
5551    /// Encode F32 min/max as Thumb-2: VMOV + VCMP + VMRS + IT + VMOV
5552    fn encode_thumb_f32_minmax(
5553        &self,
5554        sd: &VfpReg,
5555        sn: &VfpReg,
5556        sm: &VfpReg,
5557        is_min: bool,
5558    ) -> Result<Vec<u8>> {
5559        let mut bytes = Vec::new();
5560        let sn_num = vfp_sreg_to_num(sn)?;
5561        let sm_num = vfp_sreg_to_num(sm)?;
5562        let sd_num = vfp_sreg_to_num(sd)?;
5563
5564        // VMOV.F32 Sd, Sn
5565        let (vd, d) = encode_sreg(sd_num);
5566        let (vn, n) = encode_sreg(sn_num);
5567        let vmov_sn = 0xEEB00A40 | (d << 22) | (vd << 12) | (n << 5) | vn;
5568        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_sn));
5569
5570        // VCMP.F32 Sn, Sm
5571        let (vm, m) = encode_sreg(sm_num);
5572        let vcmp = 0xEEB40A40 | (n << 22) | (vn << 12) | (m << 5) | vm;
5573        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5574
5575        // VMRS APSR_nzcv, FPSCR
5576        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5577
5578        // IT GT (for min) or IT MI (for max)
5579        let cond: u16 = if is_min { 0xC } else { 0x4 };
5580        let it: u16 = 0xBF00 | (cond << 4) | 0x8;
5581        bytes.extend_from_slice(&it.to_le_bytes());
5582
5583        // VMOV{cond}.F32 Sd, Sm — conditional VMOV in IT block
5584        let vmov_sm = 0xEEB00A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5585        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_sm));
5586
5587        Ok(bytes)
5588    }
5589
5590    /// Encode F32 copysign as Thumb-2
5591    fn encode_thumb_f32_copysign(&self, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
5592        let mut bytes = Vec::new();
5593
5594        // VMOV R12, Sm (get sign source bits)
5595        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5596            false,
5597            sm,
5598            &Reg::R12,
5599        )?));
5600
5601        // VMOV R0, Sn (get magnitude source bits)
5602        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5603            false,
5604            sn,
5605            &Reg::R0,
5606        )?));
5607
5608        // AND.W R12, R12, #0x80000000
5609        // Thumb-2 modified immediate: 0x80000000 = constant 0x80 with rotation
5610        // Using T1 encoding: 11110 i 0 0000 S Rn | 0 imm3 Rd imm8
5611        // 0x80000000: i=0, imm3=0b001, imm8=0x00 (rotation=4, value=0x80)
5612        // Actually encoding #0x80000000 as modified constant:
5613        // bit pattern 1 followed by 31 zeros: enc = 0b0100_00000000 = 0x0100? No.
5614        // ARM modified immediate: abcdefgh rotated. 0x80000000 = 0x80 ROR 2 = enc 0x0102
5615        // Actually: value = abcdefgh ROR (2*rot). 0x80 = 10000000, ROR 2 gives 0x20000000.
5616        // For 0x80000000: 0x02 ROR 2 = 0x80000000. So imm12 = (1<<8) | 0x02 = 0x102
5617        let hw1: u16 = 0xF000 | 12; // AND.W R12, R12, #modified_const (i=0, Rn=R12)
5618        let hw2: u16 = (0x1 << 12) | (12 << 8) | 0x02; // imm3=1, Rd=R12, imm8=0x02
5619        bytes.extend_from_slice(&hw1.to_le_bytes());
5620        bytes.extend_from_slice(&hw2.to_le_bytes());
5621
5622        // BIC.W R0, R0, #0x80000000 (R0 = register 0, fields are zero)
5623        let hw1: u16 = 0xF020; // BIC.W R0, R0, #modified_const (i=0, Rn=R0)
5624        let hw2: u16 = (0x1 << 12) | 0x02; // imm3=1, Rd=R0, imm8=0x02
5625        bytes.extend_from_slice(&hw1.to_le_bytes());
5626        bytes.extend_from_slice(&hw2.to_le_bytes());
5627
5628        // ORR.W R0, R0, R12 (R0 = register 0)
5629        let hw1: u16 = 0xEA40; // ORR.W R0, R0, R12 (Rn=R0)
5630        let hw2: u16 = 12; // Rd=R0, Rm=R12
5631        bytes.extend_from_slice(&hw1.to_le_bytes());
5632        bytes.extend_from_slice(&hw2.to_le_bytes());
5633
5634        // VMOV Sd, R0
5635        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5636            true,
5637            sd,
5638            &Reg::R0,
5639        )?));
5640
5641        Ok(bytes)
5642    }
5643
5644    /// Encode F64 comparison as Thumb-2: VCMP.F64 + VMRS + MOV #0 + IT + MOV #1
5645    fn encode_thumb_f64_compare(
5646        &self,
5647        rd: &Reg,
5648        dn: &VfpReg,
5649        dm: &VfpReg,
5650        cond_code: u32,
5651    ) -> Result<Vec<u8>> {
5652        let mut bytes = Vec::new();
5653        let rd_bits = reg_to_bits(rd);
5654
5655        // VCMP.F64 Dn, Dm
5656        let dn_num = vfp_dreg_to_num(dn)?;
5657        let dm_num = vfp_dreg_to_num(dm)?;
5658        let (vd, d) = encode_dreg(dn_num);
5659        let (vm, m) = encode_dreg(dm_num);
5660        let vcmp = 0xEEB40B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5661        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5662
5663        // VMRS APSR_nzcv, FPSCR
5664        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5665
5666        // MOVS Rd, #0
5667        if rd_bits < 8 {
5668            let movs_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
5669            bytes.extend_from_slice(&movs_zero.to_le_bytes());
5670        } else {
5671            let hw1: u16 = 0xF04F;
5672            let hw2: u16 = (rd_bits as u16) << 8;
5673            bytes.extend_from_slice(&hw1.to_le_bytes());
5674            bytes.extend_from_slice(&hw2.to_le_bytes());
5675        }
5676
5677        // IT<cond>
5678        let it: u16 = 0xBF00 | ((cond_code as u16) << 4) | 0x8;
5679        bytes.extend_from_slice(&it.to_le_bytes());
5680
5681        // MOV Rd, #1
5682        if rd_bits < 8 {
5683            let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
5684            bytes.extend_from_slice(&mov_one.to_le_bytes());
5685        } else {
5686            let hw1: u16 = 0xF04F;
5687            let hw2: u16 = ((rd_bits as u16) << 8) | 0x01;
5688            bytes.extend_from_slice(&hw1.to_le_bytes());
5689            bytes.extend_from_slice(&hw2.to_le_bytes());
5690        }
5691
5692        Ok(bytes)
5693    }
5694
5695    /// Encode F64 constant load as Thumb-2: MOVW+MOVT (lo32 into R0) + MOVW+MOVT (hi32 into R12) + VMOV Dd, R0, R12
5696    fn encode_thumb_f64_const(&self, dd: &VfpReg, value: f64) -> Result<Vec<u8>> {
5697        let mut bytes = Vec::new();
5698        let bits = value.to_bits();
5699        let lo32 = bits as u32;
5700        let hi32 = (bits >> 32) as u32;
5701
5702        // MOVW R0, #lo16(lo32)
5703        let lo16 = lo32 & 0xFFFF;
5704        bytes.extend_from_slice(&self.encode_thumb32_movw_raw(0, lo16)?);
5705
5706        // MOVT R0, #hi16(lo32)
5707        let hi16 = (lo32 >> 16) & 0xFFFF;
5708        bytes.extend_from_slice(&self.encode_thumb32_movt_raw(0, hi16)?);
5709
5710        // MOVW R12, #lo16(hi32)
5711        let lo16 = hi32 & 0xFFFF;
5712        bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, lo16)?);
5713
5714        // MOVT R12, #hi16(hi32)
5715        let hi16 = (hi32 >> 16) & 0xFFFF;
5716        bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, hi16)?);
5717
5718        // VMOV Dd, R0, R12
5719        let vmov = encode_vmov_core_dreg(true, dd, &Reg::R0, &Reg::R12)?;
5720        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5721
5722        Ok(bytes)
5723    }
5724
5725    /// Encode VMOV Sd, Rm + VCVT.F64.S32/U32 Dd, Sd as Thumb-2
5726    fn encode_thumb_f64_convert_i32(&self, dd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
5727        let mut bytes = Vec::new();
5728
5729        // VMOV S0, Rm
5730        let vmov = encode_vmov_core_sreg(true, &VfpReg::S0, rm)?;
5731        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5732
5733        // VCVT.F64.S32 Dd, S0 or VCVT.F64.U32 Dd, S0
5734        let dd_num = vfp_dreg_to_num(dd)?;
5735        let (vd, d) = encode_dreg(dd_num);
5736        let base = if signed { 0xEEB80B40 } else { 0xEEB80BC0 };
5737        let vcvt = base | (d << 22) | (vd << 12);
5738        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5739
5740        Ok(bytes)
5741    }
5742
5743    /// Encode VCVT.F64.F32 Dd, Sm as Thumb-2
5744    fn encode_thumb_f64_promote_f32(&self, dd: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
5745        let dd_num = vfp_dreg_to_num(dd)?;
5746        let sm_num = vfp_sreg_to_num(sm)?;
5747        let (vd, d) = encode_dreg(dd_num);
5748        let (vm, m) = encode_sreg(sm_num);
5749
5750        let vcvt = 0xEEB70AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
5751        Ok(vfp_to_thumb_bytes(vcvt))
5752    }
5753
5754    /// Encode VCVT.S32/U32.F64 S0, Dm + VMOV Rd, S0 as Thumb-2
5755    fn encode_thumb_i32_trunc_f64(&self, rd: &Reg, dm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
5756        let mut bytes = Vec::new();
5757        let dm_num = vfp_dreg_to_num(dm)?;
5758        let (vm, m) = encode_dreg(dm_num);
5759
5760        // VCVT.S32.F64 S0, Dm or VCVT.U32.F64 S0, Dm
5761        let base = if signed { 0xEEBD0BC0 } else { 0xEEBC0BC0 };
5762        let vcvt = base | (m << 5) | vm;
5763        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5764
5765        // VMOV Rd, S0
5766        let vmov = encode_vmov_core_sreg(false, &VfpReg::S0, rd)?;
5767        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5768
5769        Ok(bytes)
5770    }
5771
5772    /// Encode F64 rounding pseudo-op as Thumb-2 via VCVT to integer and back
5773    /// Encode F64 rounding as Thumb-2.
5774    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
5775    fn encode_thumb_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
5776        let mut bytes = Vec::new();
5777        let dm_num = vfp_dreg_to_num(dm)?;
5778        let dd_num = vfp_dreg_to_num(dd)?;
5779        let (vm, m) = encode_dreg(dm_num);
5780        let (vd, d) = encode_dreg(dd_num);
5781
5782        if mode == 0b11 {
5783            // Trunc: VCVTR.S32.F64 — bit[7]=1, always truncates
5784            let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
5785            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5786        } else {
5787            let rt: u32 = 12;
5788
5789            // VMRS R12, FPSCR
5790            let vmrs = 0xEEF10A10 | (rt << 12);
5791            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5792
5793            // BIC.W R12, R12, #(3 << 22)
5794            let bic_hw1: u16 = 0xF020 | ((rt as u16) & 0xF);
5795            let bic_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | 0x03;
5796            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5797            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5798
5799            // ORR.W R12, R12, #(mode << 22)
5800            if mode != 0 {
5801                let orr_hw1: u16 = 0xF040 | ((rt as u16) & 0xF);
5802                let orr_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | (mode as u16);
5803                bytes.extend_from_slice(&orr_hw1.to_le_bytes());
5804                bytes.extend_from_slice(&orr_hw2.to_le_bytes());
5805            }
5806
5807            // VMSR FPSCR, R12
5808            let vmsr = 0xEEE10A10 | (rt << 12);
5809            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5810
5811            // VCVT.S32.F64 S0, Dm — non-R variant (bit[7]=0)
5812            let vcvt_to_int = 0xEEBD0B40 | (m << 5) | vm;
5813            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5814
5815            // Restore FPSCR
5816            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5817            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5818            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5819            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5820        }
5821
5822        // VCVT.F64.S32 Dd, S0
5823        let vcvt_to_float = 0xEEB80B40 | (d << 22) | (vd << 12);
5824        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_float));
5825
5826        Ok(bytes)
5827    }
5828
5829    /// Encode F64 min/max as Thumb-2
5830    fn encode_thumb_f64_minmax(
5831        &self,
5832        dd: &VfpReg,
5833        dn: &VfpReg,
5834        dm: &VfpReg,
5835        is_min: bool,
5836    ) -> Result<Vec<u8>> {
5837        let mut bytes = Vec::new();
5838        let dn_num = vfp_dreg_to_num(dn)?;
5839        let dm_num = vfp_dreg_to_num(dm)?;
5840        let dd_num = vfp_dreg_to_num(dd)?;
5841
5842        // VMOV.F64 Dd, Dn
5843        let (vd, d) = encode_dreg(dd_num);
5844        let (vn, n) = encode_dreg(dn_num);
5845        let vmov_dn = 0xEEB00B40 | (d << 22) | (vd << 12) | (n << 5) | vn;
5846        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_dn));
5847
5848        // VCMP.F64 Dn, Dm
5849        let (vm, m) = encode_dreg(dm_num);
5850        let vcmp = 0xEEB40B40 | (n << 22) | (vn << 12) | (m << 5) | vm;
5851        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5852
5853        // VMRS APSR_nzcv, FPSCR
5854        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5855
5856        // IT GT (for min) or IT MI (for max)
5857        let cond: u16 = if is_min { 0xC } else { 0x4 };
5858        let it: u16 = 0xBF00 | (cond << 4) | 0x8;
5859        bytes.extend_from_slice(&it.to_le_bytes());
5860
5861        // VMOV{cond}.F64 Dd, Dm
5862        let vmov_dm = 0xEEB00B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5863        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_dm));
5864
5865        Ok(bytes)
5866    }
5867
5868    /// Encode F64 copysign as Thumb-2
5869    fn encode_thumb_f64_copysign(&self, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<Vec<u8>> {
5870        let mut bytes = Vec::new();
5871
5872        // VMOV R0, R12, Dm (get sign source)
5873        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
5874            false,
5875            dm,
5876            &Reg::R0,
5877            &Reg::R12,
5878        )?));
5879
5880        // VMOV R1, R2, Dn (get magnitude source)
5881        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
5882            false,
5883            dn,
5884            &Reg::R1,
5885            &Reg::R2,
5886        )?));
5887
5888        // AND.W R12, R12, #0x80000000 (i=0, Rn=R12)
5889        let hw1: u16 = 0xF000 | 12;
5890        let hw2: u16 = (0x1 << 12) | (12 << 8) | 0x02;
5891        bytes.extend_from_slice(&hw1.to_le_bytes());
5892        bytes.extend_from_slice(&hw2.to_le_bytes());
5893
5894        // BIC.W R2, R2, #0x80000000 (i=0, Rn=R2)
5895        let hw1: u16 = 0xF020 | 2;
5896        let hw2: u16 = (0x1 << 12) | (2 << 8) | 0x02;
5897        bytes.extend_from_slice(&hw1.to_le_bytes());
5898        bytes.extend_from_slice(&hw2.to_le_bytes());
5899
5900        // ORR.W R2, R2, R12
5901        let hw1: u16 = 0xEA40 | 2;
5902        let hw2: u16 = (2 << 8) | 12;
5903        bytes.extend_from_slice(&hw1.to_le_bytes());
5904        bytes.extend_from_slice(&hw2.to_le_bytes());
5905
5906        // VMOV Dd, R1, R2
5907        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
5908            true,
5909            dd,
5910            &Reg::R1,
5911            &Reg::R2,
5912        )?));
5913
5914        Ok(bytes)
5915    }
5916
5917    /// Encode VCVT.S32/U32.F32 + VMOV as Thumb-2
5918    fn encode_thumb_i32_trunc_f32(&self, rd: &Reg, sm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
5919        let mut bytes = Vec::new();
5920
5921        let sm_num = vfp_sreg_to_num(sm)?;
5922        let (vd, d) = encode_sreg(sm_num);
5923        let (vm, m) = encode_sreg(sm_num);
5924        let base = if signed { 0xEEBD0AC0 } else { 0xEEBC0AC0 };
5925        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
5926        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5927
5928        // VMOV Rd, Sm
5929        let vmov = encode_vmov_core_sreg(false, sm, rd)?;
5930        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5931
5932        Ok(bytes)
5933    }
5934
5935    // === Thumb-2 32-bit encoding helpers ===
5936
5937    /// Encode Thumb-2 32-bit ADD with immediate
5938    fn encode_thumb32_add(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
5939        let rd_bits = reg_to_bits(rd);
5940        let rn_bits = reg_to_bits(rn);
5941
5942        // ADD.W Rd, Rn, #imm12
5943        // First halfword: 1111 0 i 0 1000 S Rn
5944        // Second halfword: 0 imm3 Rd imm8
5945        let i_bit = (imm >> 11) & 1;
5946        let imm3 = (imm >> 8) & 0x7;
5947        let imm8 = imm & 0xFF;
5948
5949        let hw1: u16 = (0xF100 | (i_bit << 10) | rn_bits) as u16;
5950        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
5951
5952        let mut bytes = hw1.to_le_bytes().to_vec();
5953        bytes.extend_from_slice(&hw2.to_le_bytes());
5954        Ok(bytes)
5955    }
5956
5957    /// Encode Thumb-2 32-bit SUB with immediate
5958    fn encode_thumb32_sub(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
5959        let rd_bits = reg_to_bits(rd);
5960        let rn_bits = reg_to_bits(rn);
5961
5962        let i_bit = (imm >> 11) & 1;
5963        let imm3 = (imm >> 8) & 0x7;
5964        let imm8 = imm & 0xFF;
5965
5966        let hw1: u16 = (0xF1A0 | (i_bit << 10) | rn_bits) as u16;
5967        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
5968
5969        let mut bytes = hw1.to_le_bytes().to_vec();
5970        bytes.extend_from_slice(&hw2.to_le_bytes());
5971        Ok(bytes)
5972    }
5973
5974    /// Encode Thumb-2 32-bit ADDS with immediate (sets flags)
5975    fn encode_thumb32_adds(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
5976        let rd_bits = reg_to_bits(rd);
5977        let rn_bits = reg_to_bits(rn);
5978
5979        let i_bit = (imm >> 11) & 1;
5980        let imm3 = (imm >> 8) & 0x7;
5981        let imm8 = imm & 0xFF;
5982
5983        // ADDS.W Rd, Rn, #imm (with S=1)
5984        // First halfword: 1111 0 i 0 1000 1 Rn = F110 | i<<10 | Rn
5985        let hw1: u16 = (0xF110 | (i_bit << 10) | rn_bits) as u16;
5986        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
5987
5988        let mut bytes = hw1.to_le_bytes().to_vec();
5989        bytes.extend_from_slice(&hw2.to_le_bytes());
5990        Ok(bytes)
5991    }
5992
5993    /// Encode Thumb-2 32-bit SUBS with immediate (sets flags)
5994    fn encode_thumb32_subs(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
5995        let rd_bits = reg_to_bits(rd);
5996        let rn_bits = reg_to_bits(rn);
5997
5998        let i_bit = (imm >> 11) & 1;
5999        let imm3 = (imm >> 8) & 0x7;
6000        let imm8 = imm & 0xFF;
6001
6002        // SUBS.W Rd, Rn, #imm (with S=1)
6003        // First halfword: 1111 0 i 0 1101 1 Rn = F1B0 | i<<10 | Rn
6004        let hw1: u16 = (0xF1B0 | (i_bit << 10) | rn_bits) as u16;
6005        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6006
6007        let mut bytes = hw1.to_le_bytes().to_vec();
6008        bytes.extend_from_slice(&hw2.to_le_bytes());
6009        Ok(bytes)
6010    }
6011
6012    /// Encode Thumb-2 32-bit MOVW (16-bit immediate)
6013    ///
6014    /// # Contract (Verus-style)
6015    /// ```text
6016    /// requires rd <= R14
6017    /// ensures result.len() == 4
6018    /// ensures (imm & 0xFFFF) can be reconstructed from the encoding
6019    /// ```
6020    fn encode_thumb32_movw(&self, rd: &Reg, imm: u32) -> Result<Vec<u8>> {
6021        let rd_bits = reg_to_bits(rd);
6022        reg_bits_checked(rd_bits)?;
6023        let imm16 = imm & 0xFFFF;
6024
6025        // MOVW Rd, #imm16
6026        // 1111 0 i 10 0 1 0 0 imm4 | 0 imm3 Rd imm8
6027        let imm4 = (imm16 >> 12) & 0xF;
6028        let i_bit = (imm16 >> 11) & 1;
6029        let imm3 = (imm16 >> 8) & 0x7;
6030        let imm8 = imm16 & 0xFF;
6031
6032        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
6033        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6034
6035        let mut bytes = hw1.to_le_bytes().to_vec();
6036        bytes.extend_from_slice(&hw2.to_le_bytes());
6037        encoding_contracts::verify_thumb32(&bytes);
6038        Ok(bytes)
6039    }
6040
6041    /// Encode Thumb-2 32-bit shift with immediate
6042    ///
6043    /// # Contract (Verus-style)
6044    /// ```text
6045    /// requires rd <= R14, rm <= R14
6046    /// ensures result.len() == 4
6047    /// ```
6048    fn encode_thumb32_shift(
6049        &self,
6050        rd: &Reg,
6051        rm: &Reg,
6052        shift: u32,
6053        shift_type: u8,
6054    ) -> Result<Vec<u8>> {
6055        let rd_bits = reg_to_bits(rd);
6056        let rm_bits = reg_to_bits(rm);
6057        reg_bits_checked(rd_bits)?;
6058        reg_bits_checked(rm_bits)?;
6059        let imm5 = shift & 0x1F;
6060        let imm2 = imm5 & 0x3;
6061        let imm3 = (imm5 >> 2) & 0x7;
6062
6063        // MOV.W Rd, Rm, <shift> #imm
6064        // EA4F 0 imm3 Rd imm2 type Rm
6065        let hw1: u16 = 0xEA4F;
6066        let hw2: u16 =
6067            ((imm3 << 12) | (rd_bits << 8) | (imm2 << 6) | ((shift_type as u32) << 4) | rm_bits)
6068                as u16;
6069
6070        let mut bytes = hw1.to_le_bytes().to_vec();
6071        bytes.extend_from_slice(&hw2.to_le_bytes());
6072        Ok(bytes)
6073    }
6074
6075    /// Encode Thumb-2 32-bit shift by register
6076    /// Encoding: 11111010 0xx0 Rn | 1111 Rd 0000 Rm
6077    /// shift_type: 00=LSL, 01=LSR, 10=ASR, 11=ROR
6078    fn encode_thumb32_shift_reg(
6079        &self,
6080        rd: &Reg,
6081        rn: &Reg,
6082        rm: &Reg,
6083        shift_type: u8,
6084    ) -> Result<Vec<u8>> {
6085        let rd_bits = reg_to_bits(rd);
6086        let rn_bits = reg_to_bits(rn);
6087        let rm_bits = reg_to_bits(rm);
6088
6089        // hw1: 1111 1010 0xx0 Rn
6090        let hw1: u16 = (0xFA00 | ((shift_type as u32) << 5) | rn_bits) as u16;
6091        // hw2: 1111 Rd 0000 Rm
6092        let hw2: u16 = (0xF000 | (rd_bits << 8) | rm_bits) as u16;
6093
6094        let mut bytes = hw1.to_le_bytes().to_vec();
6095        bytes.extend_from_slice(&hw2.to_le_bytes());
6096        Ok(bytes)
6097    }
6098
6099    /// Encode Thumb-2 32-bit CMP with immediate
6100    fn encode_thumb32_cmp_imm(&self, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6101        let rn_bits = reg_to_bits(rn);
6102
6103        let i_bit = (imm >> 11) & 1;
6104        let imm3 = (imm >> 8) & 0x7;
6105        let imm8 = imm & 0xFF;
6106
6107        // CMP.W Rn, #imm
6108        let hw1: u16 = (0xF1B0 | (i_bit << 10) | rn_bits) as u16;
6109        let hw2: u16 = ((imm3 << 12) | 0x0F00 | imm8) as u16;
6110
6111        let mut bytes = hw1.to_le_bytes().to_vec();
6112        bytes.extend_from_slice(&hw2.to_le_bytes());
6113        Ok(bytes)
6114    }
6115
6116    /// Encode Thumb-2 32-bit LDR
6117    fn encode_thumb32_ldr(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6118        let rd_bits = reg_to_bits(rd);
6119        let base_bits = reg_to_bits(base);
6120
6121        // LDR.W Rd, [Rn, #imm12]
6122        let hw1: u16 = (0xF8D0 | base_bits) as u16;
6123        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6124
6125        let mut bytes = hw1.to_le_bytes().to_vec();
6126        bytes.extend_from_slice(&hw2.to_le_bytes());
6127        Ok(bytes)
6128    }
6129
6130    /// Encode Thumb-2 32-bit STR
6131    fn encode_thumb32_str(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6132        let rd_bits = reg_to_bits(rd);
6133        let base_bits = reg_to_bits(base);
6134
6135        // STR.W Rd, [Rn, #imm12]
6136        let hw1: u16 = (0xF8C0 | base_bits) as u16;
6137        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6138
6139        let mut bytes = hw1.to_le_bytes().to_vec();
6140        bytes.extend_from_slice(&hw2.to_le_bytes());
6141        Ok(bytes)
6142    }
6143
6144    /// Encode Thumb-2 32-bit LDR with register offset: LDR.W Rd, [Rn, Rm]
6145    fn encode_thumb32_ldr_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6146        let rd_bits = reg_to_bits(rd);
6147        let base_bits = reg_to_bits(base);
6148        let rm_bits = reg_to_bits(offset_reg);
6149
6150        // LDR.W Rd, [Rn, Rm, LSL #0]
6151        // Encoding: 1111 1000 0101 Rn | Rt 0000 00 imm2 Rm
6152        // imm2 = 00 for no shift (LSL #0)
6153        let hw1: u16 = (0xF850 | base_bits) as u16;
6154        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6155
6156        let mut bytes = hw1.to_le_bytes().to_vec();
6157        bytes.extend_from_slice(&hw2.to_le_bytes());
6158        Ok(bytes)
6159    }
6160
6161    /// Encode Thumb-2 32-bit STR with register offset: STR.W Rd, [Rn, Rm]
6162    fn encode_thumb32_str_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6163        let rd_bits = reg_to_bits(rd);
6164        let base_bits = reg_to_bits(base);
6165        let rm_bits = reg_to_bits(offset_reg);
6166
6167        // STR.W Rd, [Rn, Rm, LSL #0]
6168        // Encoding: 1111 1000 0100 Rn | Rt 0000 00 imm2 Rm
6169        // imm2 = 00 for no shift (LSL #0)
6170        let hw1: u16 = (0xF840 | base_bits) as u16;
6171        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6172
6173        let mut bytes = hw1.to_le_bytes().to_vec();
6174        bytes.extend_from_slice(&hw2.to_le_bytes());
6175        Ok(bytes)
6176    }
6177
6178    // === Sub-word load/store Thumb-2 encoding helpers ===
6179
6180    /// Encode Thumb-2 32-bit LDRB with immediate: LDRB.W Rd, [Rn, #imm12]
6181    fn encode_thumb32_ldrb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6182        let rd_bits = reg_to_bits(rd);
6183        let base_bits = reg_to_bits(base);
6184        // LDRB.W Rd, [Rn, #imm12]: 1111 1000 1001 Rn | Rt imm12
6185        let hw1: u16 = (0xF890 | base_bits) as u16;
6186        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6187        let mut bytes = hw1.to_le_bytes().to_vec();
6188        bytes.extend_from_slice(&hw2.to_le_bytes());
6189        Ok(bytes)
6190    }
6191
6192    /// Encode Thumb-2 32-bit LDRB with register: LDRB.W Rd, [Rn, Rm]
6193    fn encode_thumb32_ldrb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6194        let rd_bits = reg_to_bits(rd);
6195        let base_bits = reg_to_bits(base);
6196        let rm_bits = reg_to_bits(offset_reg);
6197        // LDRB.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0001 Rn | Rt 0000 00 imm2 Rm
6198        let hw1: u16 = (0xF810 | base_bits) as u16;
6199        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6200        let mut bytes = hw1.to_le_bytes().to_vec();
6201        bytes.extend_from_slice(&hw2.to_le_bytes());
6202        Ok(bytes)
6203    }
6204
6205    /// Encode Thumb-2 32-bit LDRSB with immediate: LDRSB.W Rd, [Rn, #imm12]
6206    fn encode_thumb32_ldrsb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6207        let rd_bits = reg_to_bits(rd);
6208        let base_bits = reg_to_bits(base);
6209        // LDRSB.W Rd, [Rn, #imm12]: 1111 1001 1001 Rn | Rt imm12
6210        let hw1: u16 = (0xF990 | base_bits) as u16;
6211        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6212        let mut bytes = hw1.to_le_bytes().to_vec();
6213        bytes.extend_from_slice(&hw2.to_le_bytes());
6214        Ok(bytes)
6215    }
6216
6217    /// Encode Thumb-2 32-bit LDRSB with register: LDRSB.W Rd, [Rn, Rm]
6218    fn encode_thumb32_ldrsb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6219        let rd_bits = reg_to_bits(rd);
6220        let base_bits = reg_to_bits(base);
6221        let rm_bits = reg_to_bits(offset_reg);
6222        // LDRSB.W Rd, [Rn, Rm, LSL #0]: 1111 1001 0001 Rn | Rt 0000 00 imm2 Rm
6223        let hw1: u16 = (0xF910 | base_bits) as u16;
6224        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6225        let mut bytes = hw1.to_le_bytes().to_vec();
6226        bytes.extend_from_slice(&hw2.to_le_bytes());
6227        Ok(bytes)
6228    }
6229
6230    /// Encode Thumb-2 32-bit LDRH with immediate: LDRH.W Rd, [Rn, #imm12]
6231    fn encode_thumb32_ldrh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6232        let rd_bits = reg_to_bits(rd);
6233        let base_bits = reg_to_bits(base);
6234        // LDRH.W Rd, [Rn, #imm12]: 1111 1000 1011 Rn | Rt imm12
6235        let hw1: u16 = (0xF8B0 | base_bits) as u16;
6236        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6237        let mut bytes = hw1.to_le_bytes().to_vec();
6238        bytes.extend_from_slice(&hw2.to_le_bytes());
6239        Ok(bytes)
6240    }
6241
6242    /// Encode Thumb-2 32-bit LDRH with register: LDRH.W Rd, [Rn, Rm]
6243    fn encode_thumb32_ldrh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6244        let rd_bits = reg_to_bits(rd);
6245        let base_bits = reg_to_bits(base);
6246        let rm_bits = reg_to_bits(offset_reg);
6247        // LDRH.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0011 Rn | Rt 0000 00 imm2 Rm
6248        let hw1: u16 = (0xF830 | base_bits) as u16;
6249        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6250        let mut bytes = hw1.to_le_bytes().to_vec();
6251        bytes.extend_from_slice(&hw2.to_le_bytes());
6252        Ok(bytes)
6253    }
6254
6255    /// Encode Thumb-2 32-bit LDRSH with immediate: LDRSH.W Rd, [Rn, #imm12]
6256    fn encode_thumb32_ldrsh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6257        let rd_bits = reg_to_bits(rd);
6258        let base_bits = reg_to_bits(base);
6259        // LDRSH.W Rd, [Rn, #imm12]: 1111 1001 1011 Rn | Rt imm12
6260        let hw1: u16 = (0xF9B0 | base_bits) as u16;
6261        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6262        let mut bytes = hw1.to_le_bytes().to_vec();
6263        bytes.extend_from_slice(&hw2.to_le_bytes());
6264        Ok(bytes)
6265    }
6266
6267    /// Encode Thumb-2 32-bit LDRSH with register: LDRSH.W Rd, [Rn, Rm]
6268    fn encode_thumb32_ldrsh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6269        let rd_bits = reg_to_bits(rd);
6270        let base_bits = reg_to_bits(base);
6271        let rm_bits = reg_to_bits(offset_reg);
6272        // LDRSH.W Rd, [Rn, Rm, LSL #0]: 1111 1001 0011 Rn | Rt 0000 00 imm2 Rm
6273        let hw1: u16 = (0xF930 | base_bits) as u16;
6274        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6275        let mut bytes = hw1.to_le_bytes().to_vec();
6276        bytes.extend_from_slice(&hw2.to_le_bytes());
6277        Ok(bytes)
6278    }
6279
6280    /// Encode Thumb-2 32-bit STRB with immediate: STRB.W Rd, [Rn, #imm12]
6281    fn encode_thumb32_strb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6282        let rd_bits = reg_to_bits(rd);
6283        let base_bits = reg_to_bits(base);
6284        // STRB.W Rd, [Rn, #imm12]: 1111 1000 1000 Rn | Rt imm12
6285        let hw1: u16 = (0xF880 | base_bits) as u16;
6286        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6287        let mut bytes = hw1.to_le_bytes().to_vec();
6288        bytes.extend_from_slice(&hw2.to_le_bytes());
6289        Ok(bytes)
6290    }
6291
6292    /// Encode Thumb-2 32-bit STRB with register: STRB.W Rd, [Rn, Rm]
6293    fn encode_thumb32_strb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6294        let rd_bits = reg_to_bits(rd);
6295        let base_bits = reg_to_bits(base);
6296        let rm_bits = reg_to_bits(offset_reg);
6297        // STRB.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0000 Rn | Rt 0000 00 imm2 Rm
6298        let hw1: u16 = (0xF800 | base_bits) as u16;
6299        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6300        let mut bytes = hw1.to_le_bytes().to_vec();
6301        bytes.extend_from_slice(&hw2.to_le_bytes());
6302        Ok(bytes)
6303    }
6304
6305    /// Encode Thumb-2 32-bit STRH with immediate: STRH.W Rd, [Rn, #imm12]
6306    fn encode_thumb32_strh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6307        let rd_bits = reg_to_bits(rd);
6308        let base_bits = reg_to_bits(base);
6309        // STRH.W Rd, [Rn, #imm12]: 1111 1000 1010 Rn | Rt imm12
6310        let hw1: u16 = (0xF8A0 | base_bits) as u16;
6311        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6312        let mut bytes = hw1.to_le_bytes().to_vec();
6313        bytes.extend_from_slice(&hw2.to_le_bytes());
6314        Ok(bytes)
6315    }
6316
6317    /// Encode Thumb-2 32-bit STRH with register: STRH.W Rd, [Rn, Rm]
6318    fn encode_thumb32_strh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6319        let rd_bits = reg_to_bits(rd);
6320        let base_bits = reg_to_bits(base);
6321        let rm_bits = reg_to_bits(offset_reg);
6322        // STRH.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0010 Rn | Rt 0000 00 imm2 Rm
6323        let hw1: u16 = (0xF820 | base_bits) as u16;
6324        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6325        let mut bytes = hw1.to_le_bytes().to_vec();
6326        bytes.extend_from_slice(&hw2.to_le_bytes());
6327        Ok(bytes)
6328    }
6329
6330    /// Encode Thumb-2 32-bit ADD with immediate: ADD.W Rd, Rn, #imm
6331    fn encode_thumb32_add_imm(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6332        let rd_bits = reg_to_bits(rd);
6333        let rn_bits = reg_to_bits(rn);
6334
6335        // For small immediates, use ADD.W Rd, Rn, #imm12
6336        // Encoding: 1111 0 i 0 1 0 0 0 S Rn | 0 imm3 Rd imm8
6337        // S = 0 (don't update flags)
6338        // The 12-bit immediate is encoded as: i:imm3:imm8
6339        // For simplicity, we only support imm <= 0xFFF (direct encoding)
6340        if imm <= 0xFFF {
6341            let i_bit = (imm >> 11) & 1;
6342            let imm3 = (imm >> 8) & 0x7;
6343            let imm8 = imm & 0xFF;
6344
6345            let hw1: u16 = (0xF100 | (i_bit << 10) | rn_bits) as u16;
6346            let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6347
6348            let mut bytes = hw1.to_le_bytes().to_vec();
6349            bytes.extend_from_slice(&hw2.to_le_bytes());
6350            Ok(bytes)
6351        } else {
6352            // For larger immediates, would need MOVW/MOVT + ADD
6353            // For now, return error
6354            Err(synth_core::Error::synthesis(
6355                "ADD immediate too large for single instruction",
6356            ))
6357        }
6358    }
6359
6360    // === Raw encoding helpers for POPCNT (take register numbers directly) ===
6361
6362    /// Encode Thumb-2 32-bit MOVW (16-bit immediate) - raw version
6363    ///
6364    /// # Contract (Verus-style)
6365    /// ```text
6366    /// requires rd <= 14, imm16 <= 0xFFFF
6367    /// ensures result.len() == 4
6368    /// ```
6369    fn encode_thumb32_movw_raw(&self, rd: u32, imm16: u32) -> Result<Vec<u8>> {
6370        reg_bits_checked(rd)?;
6371        encoding_contracts::verify_imm16(imm16);
6372        // MOVW Rd, #imm16
6373        // 1111 0 i 10 0 1 0 0 imm4 | 0 imm3 Rd imm8
6374        let imm16 = imm16 & 0xFFFF;
6375        let imm4 = (imm16 >> 12) & 0xF;
6376        let i_bit = (imm16 >> 11) & 1;
6377        let imm3 = (imm16 >> 8) & 0x7;
6378        let imm8 = imm16 & 0xFF;
6379
6380        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
6381        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6382
6383        let mut bytes = hw1.to_le_bytes().to_vec();
6384        bytes.extend_from_slice(&hw2.to_le_bytes());
6385        encoding_contracts::verify_thumb32(&bytes);
6386        Ok(bytes)
6387    }
6388
6389    /// Encode Thumb-2 32-bit MOVT (move top 16 bits) - raw version
6390    ///
6391    /// # Contract (Verus-style)
6392    /// ```text
6393    /// requires rd <= 14, imm16 <= 0xFFFF
6394    /// ensures result.len() == 4
6395    /// ```
6396    fn encode_thumb32_movt_raw(&self, rd: u32, imm16: u32) -> Result<Vec<u8>> {
6397        reg_bits_checked(rd)?;
6398        encoding_contracts::verify_imm16(imm16);
6399        // MOVT Rd, #imm16
6400        // 1111 0 i 10 1 1 0 0 imm4 | 0 imm3 Rd imm8
6401        let imm16 = imm16 & 0xFFFF;
6402        let imm4 = (imm16 >> 12) & 0xF;
6403        let i_bit = (imm16 >> 11) & 1;
6404        let imm3 = (imm16 >> 8) & 0x7;
6405        let imm8 = imm16 & 0xFF;
6406
6407        let hw1: u16 = (0xF2C0 | (i_bit << 10) | imm4) as u16;
6408        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6409
6410        let mut bytes = hw1.to_le_bytes().to_vec();
6411        bytes.extend_from_slice(&hw2.to_le_bytes());
6412        encoding_contracts::verify_thumb32(&bytes);
6413        Ok(bytes)
6414    }
6415
6416    /// Encode Thumb-2 32-bit LSR (logical shift right) with immediate - raw version
6417    fn encode_thumb32_lsr_raw(&self, rd: u32, rm: u32, shift: u32) -> Result<Vec<u8>> {
6418        // MOV.W Rd, Rm, LSR #imm
6419        // EA4F 0 imm3 Rd imm2 01 Rm
6420        let imm5 = shift & 0x1F;
6421        let imm2 = imm5 & 0x3;
6422        let imm3 = (imm5 >> 2) & 0x7;
6423
6424        let hw1: u16 = 0xEA4F;
6425        let hw2: u16 = ((imm3 << 12) | (rd << 8) | (imm2 << 6) | (0b01 << 4) | rm) as u16;
6426
6427        let mut bytes = hw1.to_le_bytes().to_vec();
6428        bytes.extend_from_slice(&hw2.to_le_bytes());
6429        Ok(bytes)
6430    }
6431
6432    /// Encode Thumb-2 32-bit AND (register) - raw version
6433    fn encode_thumb32_and_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6434        // AND.W Rd, Rn, Rm
6435        // EA00 Rn | 0 Rd 00 00 Rm
6436        let hw1: u16 = (0xEA00 | rn) as u16;
6437        let hw2: u16 = ((rd << 8) | rm) as u16;
6438
6439        let mut bytes = hw1.to_le_bytes().to_vec();
6440        bytes.extend_from_slice(&hw2.to_le_bytes());
6441        Ok(bytes)
6442    }
6443
6444    /// Encode Thumb-2 32-bit AND with immediate - raw version
6445    fn encode_thumb32_and_imm_raw(&self, rd: u32, rn: u32, imm: u32) -> Result<Vec<u8>> {
6446        // AND.W Rd, Rn, #<modified_immediate>
6447        // For small immediates (0-255), the encoding is simpler
6448        // F0 00 Rn | 0 imm3 Rd imm8
6449        let i_bit = (imm >> 11) & 1;
6450        let imm3 = (imm >> 8) & 0x7;
6451        let imm8 = imm & 0xFF;
6452
6453        let hw1: u16 = (0xF000 | (i_bit << 10) | rn) as u16;
6454        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6455
6456        let mut bytes = hw1.to_le_bytes().to_vec();
6457        bytes.extend_from_slice(&hw2.to_le_bytes());
6458        Ok(bytes)
6459    }
6460
6461    /// Encode Thumb-2 32-bit SUB (register) - raw version
6462    fn encode_thumb32_sub_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6463        // SUB.W Rd, Rn, Rm
6464        // EBA0 Rn | 0 Rd 00 00 Rm
6465        let hw1: u16 = (0xEBA0 | rn) as u16;
6466        let hw2: u16 = ((rd << 8) | rm) as u16;
6467
6468        let mut bytes = hw1.to_le_bytes().to_vec();
6469        bytes.extend_from_slice(&hw2.to_le_bytes());
6470        Ok(bytes)
6471    }
6472
6473    /// Encode Thumb-2 32-bit ADD (register) - raw version
6474    fn encode_thumb32_add_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6475        // ADD.W Rd, Rn, Rm
6476        // EB00 Rn | 0 Rd 00 00 Rm
6477        let hw1: u16 = (0xEB00 | rn) as u16;
6478        let hw2: u16 = ((rd << 8) | rm) as u16;
6479
6480        let mut bytes = hw1.to_le_bytes().to_vec();
6481        bytes.extend_from_slice(&hw2.to_le_bytes());
6482        Ok(bytes)
6483    }
6484
6485    /// Encode Thumb-2 32-bit ADDS (register, flag-setting) - raw version.
6486    /// Used as the high-register fallback for `ArmOp::Adds` (i64 low-word add)
6487    /// so R8-R11 pair operands don't overflow the 16-bit field — #178/#180.
6488    fn encode_thumb32_adds_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6489        // ADDS.W Rd, Rn, Rm (T3, S=1): EB10 Rn | 0 Rd 00 00 Rm
6490        let hw1: u16 = (0xEB10 | rn) as u16;
6491        let hw2: u16 = ((rd << 8) | rm) as u16;
6492        let mut bytes = hw1.to_le_bytes().to_vec();
6493        bytes.extend_from_slice(&hw2.to_le_bytes());
6494        Ok(bytes)
6495    }
6496
6497    /// Encode Thumb-2 32-bit SUBS (register, flag-setting) - raw version.
6498    /// High-register fallback for `ArmOp::Subs` (i64 low-word subtract) — #178/#180.
6499    fn encode_thumb32_subs_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6500        // SUBS.W Rd, Rn, Rm (T3, S=1): EBB0 Rn | 0 Rd 00 00 Rm
6501        let hw1: u16 = (0xEBB0 | rn) as u16;
6502        let hw2: u16 = ((rd << 8) | rm) as u16;
6503        let mut bytes = hw1.to_le_bytes().to_vec();
6504        bytes.extend_from_slice(&hw2.to_le_bytes());
6505        Ok(bytes)
6506    }
6507
6508    /// Encode a sequence of ARM instructions
6509    pub fn encode_sequence(&self, ops: &[ArmOp]) -> Result<Vec<u8>> {
6510        let mut code = Vec::new();
6511
6512        for op in ops {
6513            let encoded = self.encode(op)?;
6514            code.extend_from_slice(&encoded);
6515        }
6516
6517        Ok(code)
6518    }
6519}
6520
6521/// Convert register to bit encoding (0-15)
6522fn reg_to_bits(reg: &Reg) -> u32 {
6523    match reg {
6524        Reg::R0 => 0,
6525        Reg::R1 => 1,
6526        Reg::R2 => 2,
6527        Reg::R3 => 3,
6528        Reg::R4 => 4,
6529        Reg::R5 => 5,
6530        Reg::R6 => 6,
6531        Reg::R7 => 7,
6532        Reg::R8 => 8,
6533        Reg::R9 => 9,
6534        Reg::R10 => 10,
6535        Reg::R11 => 11,
6536        Reg::R12 => 12,
6537        Reg::SP => 13,
6538        Reg::LR => 14,
6539        Reg::PC => 15,
6540    }
6541}
6542
6543/// Fallible form of the `verify_reg_bits` contract. PC (R15) is not a valid
6544/// data operand for the Thumb-2 encodings that use this guard (SDIV/UDIV/MLS/…
6545/// are UNPREDICTABLE with PC). Synth's own codegen never emits PC there, but
6546/// the encoder must stay *total* over arbitrary `ArmOp` inputs — the fuzz
6547/// harness (`encoder_no_panic`) requires Ok-or-Err, never a panic. Pre-fix, the
6548/// `debug_assert` in `verify_reg_bits` aborted under `-Cdebug-assertions`.
6549/// Returns a typed Err instead. See #185.
6550fn reg_bits_checked(bits: u32) -> Result<()> {
6551    if bits > 14 {
6552        return Err(synth_core::Error::synthesis(format!(
6553            "register bits {bits} (PC/R15) is not a valid operand for this Thumb-2 encoding"
6554        )));
6555    }
6556    Ok(())
6557}
6558
/// Try to express a 32-bit value as an ARM rotated immediate
/// (`imm8 ROR 2*rot4`).
///
/// Returns `Some((rot4 << 8 | imm8, 1))` using the smallest rotation that
/// works, or `None` when no rotation leaves the value within 8 bits. The
/// second tuple element is always 1.
fn try_encode_rotated_imm(val: u32) -> Option<(u32, u32)> {
    // Rotating left by 2*rot undoes the ROR; rot = 0 also covers val == 0.
    (0..16u32).find_map(|rot| {
        let candidate = val.rotate_left(rot * 2);
        if candidate > 0xFF {
            None
        } else {
            Some(((rot << 8) | candidate, 1))
        }
    })
}
6576
6577/// Encode operand2 field and return (bits, immediate_flag).
6578/// For ARM32 mode, immediates use the rotated-immediate encoding (imm8 ROR 2*rot4).
6579/// Panics if an immediate value cannot be represented. Callers that need large
6580/// immediates should use MOVW/MOVT instead of Operand2::Imm.
6581fn encode_operand2(op2: &Operand2) -> (u32, u32) {
6582    match op2 {
6583        Operand2::Imm(val) => {
6584            let uval = *val as u32;
6585            // Attempt rotated-immediate encoding (ARM32 Operand2)
6586            if let Some(encoded) = try_encode_rotated_imm(uval) {
6587                encoded
6588            } else {
6589                // Fallback: mask to 8 bits (legacy behavior for values that
6590                // cannot be represented). This should not be reached for
6591                // correctly-selected instructions; the instruction selector
6592                // must use MOVW/MOVT for large constants.
6593                let imm = uval & 0xFF;
6594                (imm, 1)
6595            }
6596        }
6597
6598        Operand2::Reg(reg) => {
6599            let reg_bits = reg_to_bits(reg);
6600            (reg_bits, 0) // I=0 for register
6601        }
6602
6603        Operand2::RegShift {
6604            rm,
6605            shift: _,
6606            amount,
6607        } => {
6608            // Simplified encoding with shift
6609            let rm_bits = reg_to_bits(rm);
6610            let shift_bits = (*amount & 0x1F) << 7;
6611            (shift_bits | rm_bits, 0)
6612        }
6613    }
6614}
6615
6616/// Encode memory address to (base_reg, offset)
6617fn encode_mem_addr(addr: &MemAddr) -> (u32, u32) {
6618    let base_bits = reg_to_bits(&addr.base);
6619    let offset_bits = (addr.offset as u32) & 0xFFF; // 12-bit offset
6620    (base_bits, offset_bits)
6621}
6622
6623/// S-register number: S0=0, S1=1, ..., S31=31
6624fn vfp_sreg_to_num(reg: &VfpReg) -> Result<u32> {
6625    match reg {
6626        VfpReg::S0 => Ok(0),
6627        VfpReg::S1 => Ok(1),
6628        VfpReg::S2 => Ok(2),
6629        VfpReg::S3 => Ok(3),
6630        VfpReg::S4 => Ok(4),
6631        VfpReg::S5 => Ok(5),
6632        VfpReg::S6 => Ok(6),
6633        VfpReg::S7 => Ok(7),
6634        VfpReg::S8 => Ok(8),
6635        VfpReg::S9 => Ok(9),
6636        VfpReg::S10 => Ok(10),
6637        VfpReg::S11 => Ok(11),
6638        VfpReg::S12 => Ok(12),
6639        VfpReg::S13 => Ok(13),
6640        VfpReg::S14 => Ok(14),
6641        VfpReg::S15 => Ok(15),
6642        VfpReg::S16 => Ok(16),
6643        VfpReg::S17 => Ok(17),
6644        VfpReg::S18 => Ok(18),
6645        VfpReg::S19 => Ok(19),
6646        VfpReg::S20 => Ok(20),
6647        VfpReg::S21 => Ok(21),
6648        VfpReg::S22 => Ok(22),
6649        VfpReg::S23 => Ok(23),
6650        VfpReg::S24 => Ok(24),
6651        VfpReg::S25 => Ok(25),
6652        VfpReg::S26 => Ok(26),
6653        VfpReg::S27 => Ok(27),
6654        VfpReg::S28 => Ok(28),
6655        VfpReg::S29 => Ok(29),
6656        VfpReg::S30 => Ok(30),
6657        VfpReg::S31 => Ok(31),
6658        // D-registers are not used in F32 single-precision encodings
6659        _ => Err(synth_core::Error::SynthesisError(
6660            "D-register not supported in single-precision VFP encoding".to_string(),
6661        )),
6662    }
6663}
6664
6665/// D-register number: D0=0, D1=1, ..., D15=15
6666fn vfp_dreg_to_num(reg: &VfpReg) -> Result<u32> {
6667    match reg {
6668        VfpReg::D0 => Ok(0),
6669        VfpReg::D1 => Ok(1),
6670        VfpReg::D2 => Ok(2),
6671        VfpReg::D3 => Ok(3),
6672        VfpReg::D4 => Ok(4),
6673        VfpReg::D5 => Ok(5),
6674        VfpReg::D6 => Ok(6),
6675        VfpReg::D7 => Ok(7),
6676        VfpReg::D8 => Ok(8),
6677        VfpReg::D9 => Ok(9),
6678        VfpReg::D10 => Ok(10),
6679        VfpReg::D11 => Ok(11),
6680        VfpReg::D12 => Ok(12),
6681        VfpReg::D13 => Ok(13),
6682        VfpReg::D14 => Ok(14),
6683        VfpReg::D15 => Ok(15),
6684        // S-registers are not used in F64 double-precision encodings
6685        _ => Err(synth_core::Error::SynthesisError(
6686            "S-register not supported in double-precision VFP encoding".to_string(),
6687        )),
6688    }
6689}
6690
/// Break an S-register number into `(Vx, qualifier)` for VFP encoding.
///
/// `Vx` is the number divided by two and the qualifier is its lowest bit.
/// Depending on the operand role the qualifier is placed in D (bit 22),
/// N (bit 7) or M (bit 5) by the caller.
fn encode_sreg(s: u32) -> (u32, u32) {
    let qualifier = s % 2;
    let vx = s / 2;
    (vx, qualifier)
}
6697
/// Break a D-register number into `(Vx, qualifier)` for double-precision VFP
/// encoding.
///
/// `Vx` is the low four bits of the number and the qualifier is bit 4, so for
/// D0-D15 the qualifier is always 0.
fn encode_dreg(d: u32) -> (u32, u32) {
    let vx = d % 16;
    let qualifier = (d / 16) % 2;
    (vx, qualifier)
}
6704
6705/// Encode a VFP 3-register arithmetic instruction (VADD.F32, VSUB.F32, VMUL.F32, VDIV.F32).
6706/// Returns the full 32-bit instruction word.
6707///
6708/// VFP encoding: [cond 1110] [D opc1 Vn] [Vd 101 sz] [N opc2 M 0 Vm]
6709/// For single-precision (sz=0), coprocessor = 0xA (bits[11:8]).
6710fn encode_vfp_3reg(base: u32, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<u32> {
6711    let sd_num = vfp_sreg_to_num(sd)?;
6712    let sn_num = vfp_sreg_to_num(sn)?;
6713    let sm_num = vfp_sreg_to_num(sm)?;
6714    let (vd, d) = encode_sreg(sd_num);
6715    let (vn, n) = encode_sreg(sn_num);
6716    let (vm, m) = encode_sreg(sm_num);
6717
6718    Ok(base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm)
6719}
6720
6721/// Encode a VFP 2-register instruction (VNEG.F32, VABS.F32, VSQRT.F32).
6722/// Returns the full 32-bit instruction word.
6723fn encode_vfp_2reg(base: u32, sd: &VfpReg, sm: &VfpReg) -> Result<u32> {
6724    let sd_num = vfp_sreg_to_num(sd)?;
6725    let sm_num = vfp_sreg_to_num(sm)?;
6726    let (vd, d) = encode_sreg(sd_num);
6727    let (vm, m) = encode_sreg(sm_num);
6728
6729    Ok(base | (d << 22) | (vd << 12) | (m << 5) | vm)
6730}
6731
6732/// Encode a VFP load/store (VLDR.F32 / VSTR.F32).
6733/// offset is in bytes and must be word-aligned; encoded as imm8 = offset/4.
6734/// U bit (bit 23) controls add/subtract offset.
6735fn encode_vfp_ldst(base: u32, sd: &VfpReg, addr: &MemAddr) -> Result<u32> {
6736    let sd_num = vfp_sreg_to_num(sd)?;
6737    let (vd, d) = encode_sreg(sd_num);
6738    let rn = reg_to_bits(&addr.base);
6739
6740    let offset = addr.offset;
6741    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6742    let abs_offset = offset.unsigned_abs();
6743    let imm8 = (abs_offset / 4) & 0xFF;
6744
6745    Ok(base | (u_bit << 23) | (d << 22) | (rn << 16) | (vd << 12) | imm8)
6746}
6747
6748/// Encode VMOV between core register and S-register.
6749/// VMOV Sn, Rt: 0xEE00_0A10 | (Vn << 16) | (N << 7) | (Rt << 12)
6750/// VMOV Rt, Sn: 0xEE10_0A10 | (Vn << 16) | (N << 7) | (Rt << 12)
6751fn encode_vmov_core_sreg(to_sreg: bool, sreg: &VfpReg, core: &Reg) -> Result<u32> {
6752    let s_num = vfp_sreg_to_num(sreg)?;
6753    let (vn, n) = encode_sreg(s_num);
6754    let rt = reg_to_bits(core);
6755
6756    let base = if to_sreg { 0xEE000A10 } else { 0xEE100A10 };
6757    Ok(base | (vn << 16) | (rt << 12) | (n << 7))
6758}
6759
6760/// Encode a VFP 3-register double-precision instruction (VADD.F64, VSUB.F64, etc.).
6761/// For double-precision (sz=1), coprocessor = 0xB (bits[11:8]).
6762/// The base should have bit 8 = 1 for F64 (0xB suffix instead of 0xA).
6763fn encode_vfp_3reg_f64(base: u32, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<u32> {
6764    let dd_num = vfp_dreg_to_num(dd)?;
6765    let dn_num = vfp_dreg_to_num(dn)?;
6766    let dm_num = vfp_dreg_to_num(dm)?;
6767    let (vd, d) = encode_dreg(dd_num);
6768    let (vn, n) = encode_dreg(dn_num);
6769    let (vm, m) = encode_dreg(dm_num);
6770
6771    Ok(base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm)
6772}
6773
6774/// Encode a VFP 2-register double-precision instruction (VNEG.F64, VABS.F64, VSQRT.F64).
6775fn encode_vfp_2reg_f64(base: u32, dd: &VfpReg, dm: &VfpReg) -> Result<u32> {
6776    let dd_num = vfp_dreg_to_num(dd)?;
6777    let dm_num = vfp_dreg_to_num(dm)?;
6778    let (vd, d) = encode_dreg(dd_num);
6779    let (vm, m) = encode_dreg(dm_num);
6780
6781    Ok(base | (d << 22) | (vd << 12) | (m << 5) | vm)
6782}
6783
6784/// Encode a VFP load/store for double-precision (VLDR.64 / VSTR.64).
6785/// offset is in bytes and must be word-aligned; encoded as imm8 = offset/4.
6786fn encode_vfp_ldst_f64(base: u32, dd: &VfpReg, addr: &MemAddr) -> Result<u32> {
6787    let dd_num = vfp_dreg_to_num(dd)?;
6788    let (vd, d) = encode_dreg(dd_num);
6789    let rn = reg_to_bits(&addr.base);
6790
6791    let offset = addr.offset;
6792    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6793    let abs_offset = offset.unsigned_abs();
6794    let imm8 = (abs_offset / 4) & 0xFF;
6795
6796    Ok(base | (u_bit << 23) | (d << 22) | (rn << 16) | (vd << 12) | imm8)
6797}
6798
6799/// Encode VMOV between two core registers and a D-register.
6800/// VMOV Dm, Rt, Rt2: 0xEC40_0B10 | (Rt2 << 16) | (Rt << 12) | (M << 5) | Vm
6801/// VMOV Rt, Rt2, Dm: 0xEC50_0B10 | (Rt2 << 16) | (Rt << 12) | (M << 5) | Vm
6802fn encode_vmov_core_dreg(
6803    to_dreg: bool,
6804    dreg: &VfpReg,
6805    core_lo: &Reg,
6806    core_hi: &Reg,
6807) -> Result<u32> {
6808    let d_num = vfp_dreg_to_num(dreg)?;
6809    let (vm, m) = encode_dreg(d_num);
6810    let rt = reg_to_bits(core_lo);
6811    let rt2 = reg_to_bits(core_hi);
6812
6813    let base = if to_dreg { 0xEC400B10 } else { 0xEC500B10 };
6814    Ok(base | (rt2 << 16) | (rt << 12) | (m << 5) | vm)
6815}
6816
/// Serialise a 32-bit VFP instruction word as Thumb-2 bytes: the upper
/// halfword first, then the lower halfword, each in little-endian byte order.
fn vfp_to_thumb_bytes(instr: u32) -> Vec<u8> {
    // Little-endian bytes of the whole word: lo0, lo1 form the low halfword
    // and hi0, hi1 the high halfword.
    let [lo0, lo1, hi0, hi1] = instr.to_le_bytes();
    vec![hi0, hi1, lo0, lo1]
}
6825
6826// ============================================================================
6827// Helium MVE encoding helpers
6828// ============================================================================
6829
6830/// Q-register number: Q0=0, Q1=1, ..., Q7=7
6831fn qreg_to_num(reg: &QReg) -> u32 {
6832    match reg {
6833        QReg::Q0 => 0,
6834        QReg::Q1 => 1,
6835        QReg::Q2 => 2,
6836        QReg::Q3 => 3,
6837        QReg::Q4 => 4,
6838        QReg::Q5 => 5,
6839        QReg::Q6 => 6,
6840        QReg::Q7 => 7,
6841    }
6842}
6843
6844/// MVE element size to encoding bits: S8=0b00, S16=0b01, S32=0b10
6845fn mve_size_bits(size: &MveSize) -> u32 {
6846    match size {
6847        MveSize::S8 => 0b00,
6848        MveSize::S16 => 0b01,
6849        MveSize::S32 => 0b10,
6850    }
6851}
6852
6853/// Encode MVE 3-register instruction.
6854/// Q-registers are encoded as D-register pairs: Q0=D0:D1, Q1=D2:D3, etc.
6855/// In NEON/MVE encoding, the Q-register uses D-register number = Qn * 2.
6856fn encode_mve_3reg(base: u32, qd: &QReg, qn: &QReg, qm: &QReg) -> u32 {
6857    let d = qreg_to_num(qd) * 2;
6858    let n = qreg_to_num(qn) * 2;
6859    let m = qreg_to_num(qm) * 2;
6860
6861    // Standard NEON/MVE 3-register encoding:
6862    // D bit (bit 22) = Vd[4], Vd[3:0] = bits [15:12]
6863    // N bit (bit 7)  = Vn[4], Vn[3:0] = bits [19:16]
6864    // M bit (bit 5)  = Vm[4], Vm[3:0] = bits [3:0]
6865    let vd = d & 0xF;
6866    let d_bit = (d >> 4) & 1;
6867    let vn = n & 0xF;
6868    let n_bit = (n >> 4) & 1;
6869    let vm = m & 0xF;
6870    let m_bit = (m >> 4) & 1;
6871
6872    base | (d_bit << 22) | (vn << 16) | (vd << 12) | (n_bit << 7) | (m_bit << 5) | vm
6873}
6874
/// Encode MVE 3-register bitwise instruction (VAND, VORR, VEOR, VBIC).
///
/// Delegates to `encode_mve_3reg` with the same arguments, so the register
/// fields are laid out exactly as for the other three-register instructions
/// and `base` carries the opcode bits.
fn encode_mve_3reg_bitwise(base: u32, qd: &QReg, qn: &QReg, qm: &QReg) -> u32 {
    encode_mve_3reg(base, qd, qn, qm)
}
6879
6880/// Encode MVE VLDRW.32 Qd, [Rn, #offset]
6881/// Format: EC9x xxxx - contiguous load, word-sized elements
6882fn encode_mve_vldrw(qd: &QReg, addr: &MemAddr) -> u32 {
6883    let qd_enc = qreg_to_num(qd) * 2;
6884    let rn = reg_to_bits(&addr.base);
6885    let offset = addr.offset;
6886    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6887    let abs_offset = offset.unsigned_abs();
6888    let imm7 = (abs_offset / 4) & 0x7F; // 7-bit word-aligned offset
6889
6890    // VLDRW.32 Qd, [Rn, #imm]: ED10 xx80 variant
6891    0xED100E80
6892        | (u_bit << 23)
6893        | ((qd_enc >> 4) << 22)
6894        | (rn << 16)
6895        | ((qd_enc & 0xF) << 12)
6896        | (imm7 & 0x7F)
6897}
6898
6899/// Encode MVE VSTRW.32 Qd, [Rn, #offset]
6900fn encode_mve_vstrw(qd: &QReg, addr: &MemAddr) -> u32 {
6901    let qd_enc = qreg_to_num(qd) * 2;
6902    let rn = reg_to_bits(&addr.base);
6903    let offset = addr.offset;
6904    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6905    let abs_offset = offset.unsigned_abs();
6906    let imm7 = (abs_offset / 4) & 0x7F;
6907
6908    0xED000E80
6909        | (u_bit << 23)
6910        | ((qd_enc >> 4) << 22)
6911        | (rn << 16)
6912        | ((qd_enc & 0xF) << 12)
6913        | (imm7 & 0x7F)
6914}
6915
6916impl ArmEncoder {
6917    /// Encode MVE constant load: MOVW+MOVT+VMOV for each 32-bit word, then assemble Q-register
6918    fn encode_thumb_mve_const(&self, qd: &QReg, bytes: &[u8; 16]) -> Result<Vec<u8>> {
6919        let mut result = Vec::new();
6920        let qd_num = qreg_to_num(qd);
6921
6922        // Load each 32-bit word into R12 (temp) then VMOV into S-register
6923        for i in 0..4 {
6924            let word = u32::from_le_bytes([
6925                bytes[i * 4],
6926                bytes[i * 4 + 1],
6927                bytes[i * 4 + 2],
6928                bytes[i * 4 + 3],
6929            ]);
6930            let lo16 = word & 0xFFFF;
6931            let hi16 = (word >> 16) & 0xFFFF;
6932
6933            // MOVW R12, #lo16
6934            result.extend_from_slice(&self.encode_thumb32_movw_raw(12, lo16)?);
6935            // MOVT R12, #hi16
6936            if hi16 != 0 {
6937                result.extend_from_slice(&self.encode_thumb32_movt_raw(12, hi16)?);
6938            }
6939
6940            // VMOV Sn, R12 where Sn = Qd*4 + i
6941            let s_num = qd_num * 4 + i as u32;
6942            let (vn, n) = encode_sreg(s_num);
6943            let vmov: u32 = 0xEE000A10 | (vn << 16) | (12 << 12) | (n << 7);
6944            result.extend_from_slice(&vfp_to_thumb_bytes(vmov));
6945        }
6946
6947        Ok(result)
6948    }
6949
6950    /// Encode lane-wise f32 binary operation (VDIV, etc.) via S-register extraction
6951    fn encode_thumb_mve_lane_wise_f32_binop(
6952        &self,
6953        qd: &QReg,
6954        qn: &QReg,
6955        qm: &QReg,
6956        vfp_base: u32,
6957    ) -> Result<Vec<u8>> {
6958        let mut result = Vec::new();
6959        let qd_num = qreg_to_num(qd);
6960        let qn_num = qreg_to_num(qn);
6961        let qm_num = qreg_to_num(qm);
6962
6963        // For each lane 0..3: use S-registers directly (Q aliasing)
6964        for i in 0..4u32 {
6965            let sd = qd_num * 4 + i;
6966            let sn = qn_num * 4 + i;
6967            let sm = qm_num * 4 + i;
6968
6969            let (vd, d) = encode_sreg(sd);
6970            let (vn, n) = encode_sreg(sn);
6971            let (vm, m) = encode_sreg(sm);
6972
6973            let instr = vfp_base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm;
6974            result.extend_from_slice(&vfp_to_thumb_bytes(instr));
6975        }
6976
6977        Ok(result)
6978    }
6979
6980    /// Encode lane-wise f32 VSQRT via S-register extraction
6981    fn encode_thumb_mve_lane_wise_f32_sqrt(&self, qd: &QReg, qm: &QReg) -> Result<Vec<u8>> {
6982        let mut result = Vec::new();
6983        let qd_num = qreg_to_num(qd);
6984        let qm_num = qreg_to_num(qm);
6985
6986        // VSQRT.F32 base: 0xEEB10AC0
6987        for i in 0..4u32 {
6988            let sd = qd_num * 4 + i;
6989            let sm = qm_num * 4 + i;
6990
6991            let (vd, d) = encode_sreg(sd);
6992            let (vm, m) = encode_sreg(sm);
6993
6994            let instr: u32 = 0xEEB10AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
6995            result.extend_from_slice(&vfp_to_thumb_bytes(instr));
6996        }
6997
6998        Ok(result)
6999    }
7000}
7001
7002#[cfg(test)]
7003mod tests {
7004    use super::*;
7005
7006    #[test]
7007    fn test_encoder_creation() {
7008        let encoder_arm = ArmEncoder::new_arm32();
7009        assert!(!encoder_arm.thumb_mode);
7010
7011        let encoder_thumb = ArmEncoder::new_thumb2();
7012        assert!(encoder_thumb.thumb_mode);
7013    }
7014
7015    /// #204 WAKE-path regression: `SetCond` materialized 0/1 with the 16-bit
7016    /// `MOVS Rd,#imm` (T1), whose Rd field is 3 bits (R0–R7). For a high Rd
7017    /// (R8–R12) `rd_bits << 8` overflows bit 11, flipping the opcode MOVS→CMP
7018    /// (`0x2c00`), so the boolean was never written — gale's `has_waiter` kept a
7019    /// stale value and the binary-sem WAKE dispatch read garbage. High Rd must
7020    /// use the 32-bit `MOV.W` (T2). Verify the bytes, not the IR.
7021    #[test]
7022    fn test_encode_setcond_high_reg_uses_mov_w_204() {
7023        use synth_synthesis::{ArmOp, Condition, Reg};
7024        let enc = ArmEncoder::new_thumb2();
7025        // R12 (high): must be ITE + MOV.W #1 + MOV.W #0, never a 16-bit MOVS/CMP.
7026        let hi = enc
7027            .encode(&ArmOp::SetCond {
7028                rd: Reg::R12,
7029                cond: Condition::NE,
7030            })
7031            .unwrap();
7032        assert_eq!(hi.len(), 10, "ITE(2) + MOV.W(4) + MOV.W(4): {hi:02x?}");
7033        // both value halfwords are MOV.W (0xF04F) — NOT the corrupt CMP (0x2c..).
7034        assert_eq!(&hi[2..4], &[0x4F, 0xF0], "then = MOV.W: {hi:02x?}");
7035        assert_eq!(&hi[6..8], &[0x4F, 0xF0], "else = MOV.W: {hi:02x?}");
7036        assert_eq!(hi[4] & 0x0F, 0x01, "then imm = #1");
7037        assert_eq!(hi[8] & 0x0F, 0x00, "else imm = #0");
7038        // Low Rd keeps the compact 16-bit MOVS form.
7039        let lo = enc
7040            .encode(&ArmOp::SetCond {
7041                rd: Reg::R0,
7042                cond: Condition::NE,
7043            })
7044            .unwrap();
7045        assert_eq!(lo.len(), 6, "ITE(2) + MOVS(2) + MOVS(2): {lo:02x?}");
7046        assert_eq!(lo[2..4], [0x01, 0x20], "then = MOVS R0,#1");
7047        assert_eq!(lo[4..6], [0x00, 0x20], "else = MOVS R0,#0");
7048    }
7049
7050    /// #206 regression: the ARM32 (A32) `Ldr`/`Str` encoders fed `addr` through
7051    /// `encode_mem_addr`, which returns only the 12-bit immediate — so a register
7052    /// offset (`[rn, rm, #off]`) was silently dropped to `[rn, #off]`, sending
7053    /// the access to the wrong runtime address (silent miscompile on the default
7054    /// `--target arm`). A register offset must materialize `ip = rn + rm` and
7055    /// load from `[ip, #off]`. Verify the bytes.
7056    #[test]
7057    fn test_encode_arm32_indexed_load_keeps_index_206() {
7058        use synth_synthesis::{ArmOp, MemAddr, Reg};
7059        let enc = ArmEncoder::new_arm32();
7060        // ldr r0, [r11, r1, #8]  must NOT collapse to a single immediate ldr.
7061        let bytes = enc
7062            .encode(&ArmOp::Ldr {
7063                rd: Reg::R0,
7064                addr: MemAddr::reg_imm(Reg::R11, Reg::R1, 8),
7065            })
7066            .unwrap();
7067        assert_eq!(
7068            bytes.len(),
7069            8,
7070            "expected ADD ip + LDR (2 words): {bytes:02x?}"
7071        );
7072        let add = u32::from_le_bytes(bytes[0..4].try_into().unwrap());
7073        let ldr = u32::from_le_bytes(bytes[4..8].try_into().unwrap());
7074        // ADD ip, r11, r1  = 0xE08BC001
7075        assert_eq!(add, 0xE08B_C001, "ADD ip,r11,r1: {add:#010x}");
7076        // LDR r0, [ip, #8] = 0xE59C0008
7077        assert_eq!(ldr, 0xE59C_0008, "LDR r0,[ip,#8]: {ldr:#010x}");
7078        // A bare immediate ldr (the bug) would be 0xE59B0008 (base=r11) — reject.
7079        assert_ne!(ldr, 0xE59B_0008, "index must not be dropped");
7080    }
7081
7082    /// #178/#180 regression: the Thumb `Add`/`Adds`/`Subs` reg-forms used the
7083    /// 16-bit encoding unconditionally. For high registers (R12 base scratch,
7084    /// R8-R11 i64 pairs) the 3-bit register fields overflow and corrupt the
7085    /// operands — `add ip,ip,r0` came out as `adds r4,r5,r1` (0x186C), silently
7086    /// dropping the address operand and miscompiling every optimized memory
7087    /// access. High registers must use the 32-bit `.W` forms.
7088    #[test]
7089    fn test_encode_thumb_add_high_reg_uses_add_w_178_180() {
7090        let encoder = ArmEncoder::new_thumb2();
7091
7092        // add ip, ip, r0  — the exact MemLoad/MemStore base+addr op.
7093        let code = encoder
7094            .encode(&ArmOp::Add {
7095                rd: Reg::R12,
7096                rn: Reg::R12,
7097                op2: Operand2::Reg(Reg::R0),
7098            })
7099            .unwrap();
7100        // ADD.W ip, ip, r0 = EB0C 0C00 (little-endian halfwords).
7101        assert_eq!(
7102            code,
7103            vec![0x0C, 0xEB, 0x00, 0x0C],
7104            "high-reg Thumb ADD must be 32-bit ADD.W (EB0C 0C00), not corrupt 16-bit; got {code:02X?}"
7105        );
7106        // Must NOT be the buggy 16-bit 0x186C (`adds r4,r5,r1`).
7107        assert_ne!(code, vec![0x6C, 0x18], "regressed to corrupt 16-bit ADDS");
7108
7109        // Low-register add stays 16-bit (no regression for the common case).
7110        let lo = encoder
7111            .encode(&ArmOp::Add {
7112                rd: Reg::R1,
7113                rn: Reg::R2,
7114                op2: Operand2::Reg(Reg::R3),
7115            })
7116            .unwrap();
7117        assert_eq!(
7118            lo.len(),
7119            2,
7120            "low-reg ADD should remain 16-bit, got {lo:02X?}"
7121        );
7122    }
7123
7124    /// #178/#180 sibling: i64 low-word `Adds`/`Subs` can land in R8-R11 pairs;
7125    /// those must fall back to 32-bit ADDS.W/SUBS.W (flag-setting preserved).
7126    #[test]
7127    fn test_encode_thumb_adds_subs_high_reg_use_32bit_178_180() {
7128        let encoder = ArmEncoder::new_thumb2();
7129
7130        // adds r10, r10, r8  → ADDS.W = EB1A 0A08
7131        let adds = encoder
7132            .encode(&ArmOp::Adds {
7133                rd: Reg::R10,
7134                rn: Reg::R10,
7135                op2: Operand2::Reg(Reg::R8),
7136            })
7137            .unwrap();
7138        assert_eq!(
7139            adds,
7140            vec![0x1A, 0xEB, 0x08, 0x0A],
7141            "high-reg ADDS must be 32-bit ADDS.W (EB1A 0A08); got {adds:02X?}"
7142        );
7143
7144        // subs r10, r10, r8  → SUBS.W = EBBA 0A08
7145        let subs = encoder
7146            .encode(&ArmOp::Subs {
7147                rd: Reg::R10,
7148                rn: Reg::R10,
7149                op2: Operand2::Reg(Reg::R8),
7150            })
7151            .unwrap();
7152        assert_eq!(
7153            subs,
7154            vec![0xBA, 0xEB, 0x08, 0x0A],
7155            "high-reg SUBS must be 32-bit SUBS.W (EBBA 0A08); got {subs:02X?}"
7156        );
7157    }
7158
7159    /// #184 (sibling of #180): 16-bit CMN (T1) only encodes R0-R7. High registers
7160    /// must use 32-bit CMN.W, not the corrupt truncated 16-bit form.
7161    #[test]
7162    fn test_encode_thumb_cmn_high_reg_uses_cmn_w_184() {
7163        let encoder = ArmEncoder::new_thumb2();
7164
7165        // cmn r10, r8  → CMN.W = EB1A 0F08 (ADD.W S=1, Rd=PC discarded).
7166        let cmn = encoder
7167            .encode(&ArmOp::Cmn {
7168                rn: Reg::R10,
7169                op2: Operand2::Reg(Reg::R8),
7170            })
7171            .unwrap();
7172        assert_eq!(
7173            cmn,
7174            vec![0x1A, 0xEB, 0x08, 0x0F],
7175            "high-reg CMN must be 32-bit CMN.W (EB1A 0F08); got {cmn:02X?}"
7176        );
7177
7178        // Low registers stay 16-bit: cmn r1, r2 = 0x42D1.
7179        let lo = encoder
7180            .encode(&ArmOp::Cmn {
7181                rn: Reg::R1,
7182                op2: Operand2::Reg(Reg::R2),
7183            })
7184            .unwrap();
7185        assert_eq!(
7186            lo.len(),
7187            2,
7188            "low-reg CMN should remain 16-bit, got {lo:02X?}"
7189        );
7190        assert_eq!(lo, vec![0xD1, 0x42], "low-reg CMN bytes wrong: {lo:02X?}");
7191    }
7192
7193    /// #185 regression: feeding PC (R15) as a data operand to a Thumb-2 op that
7194    /// guards its registers must return Err, not panic under debug-assertions.
7195    /// (Synth never emits PC here; the fuzz harness requires encode() be total.)
7196    #[test]
7197    fn test_encode_pc_operand_returns_err_not_panic_185() {
7198        let encoder = ArmEncoder::new_thumb2();
7199        for op in [
7200            ArmOp::Sdiv {
7201                rd: Reg::PC,
7202                rn: Reg::R0,
7203                rm: Reg::R1,
7204            },
7205            ArmOp::Udiv {
7206                rd: Reg::R0,
7207                rn: Reg::PC,
7208                rm: Reg::R1,
7209            },
7210            ArmOp::Sdiv {
7211                rd: Reg::R0,
7212                rn: Reg::R1,
7213                rm: Reg::PC,
7214            },
7215        ] {
7216            let r = encoder.encode(&op);
7217            assert!(
7218                r.is_err(),
7219                "encode({op:?}) must return Err for a PC operand, got {r:?}"
7220            );
7221        }
7222        // Valid registers still encode fine (no false rejection).
7223        assert!(
7224            encoder
7225                .encode(&ArmOp::Sdiv {
7226                    rd: Reg::R0,
7227                    rn: Reg::R1,
7228                    rm: Reg::R2
7229                })
7230                .is_ok()
7231        );
7232    }
7233
7234    #[test]
7235    fn test_encode_nop_arm32() {
7236        let encoder = ArmEncoder::new_arm32();
7237        let code = encoder.encode(&ArmOp::Nop).unwrap();
7238
7239        assert_eq!(code.len(), 4); // ARM32 instructions are 4 bytes
7240        assert_eq!(code, vec![0x00, 0x00, 0xA0, 0xE1]); // MOV R0, R0
7241    }
7242
7243    #[test]
7244    fn test_encode_nop_thumb() {
7245        let encoder = ArmEncoder::new_thumb2();
7246        let code = encoder.encode(&ArmOp::Nop).unwrap();
7247
7248        assert_eq!(code.len(), 2); // Thumb instructions are 2 bytes
7249        assert_eq!(code, vec![0x00, 0xBF]); // NOP
7250    }
7251
7252    #[test]
7253    fn test_encode_mov_immediate_arm32() {
7254        let encoder = ArmEncoder::new_arm32();
7255        let op = ArmOp::Mov {
7256            rd: Reg::R0,
7257            op2: Operand2::Imm(42),
7258        };
7259
7260        let code = encoder.encode(&op).unwrap();
7261        assert_eq!(code.len(), 4);
7262
7263        // Verify it's a MOV instruction (bits should have immediate flag set)
7264        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7265        assert_eq!(instr & 0x0E000000, 0x02000000); // Check I bit is set
7266    }
7267
7268    #[test]
7269    fn test_encode_add_registers_arm32() {
7270        let encoder = ArmEncoder::new_arm32();
7271        let op = ArmOp::Add {
7272            rd: Reg::R0,
7273            rn: Reg::R1,
7274            op2: Operand2::Reg(Reg::R2),
7275        };
7276
7277        let code = encoder.encode(&op).unwrap();
7278        assert_eq!(code.len(), 4);
7279
7280        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7281        // Verify it's an ADD instruction with correct opcode
7282        assert_eq!(instr & 0x0FE00000, 0x00800000);
7283    }
7284
7285    #[test]
7286    fn test_encode_ldr_arm32() {
7287        let encoder = ArmEncoder::new_arm32();
7288        let op = ArmOp::Ldr {
7289            rd: Reg::R0,
7290            addr: MemAddr::imm(Reg::R1, 4),
7291        };
7292
7293        let code = encoder.encode(&op).unwrap();
7294        assert_eq!(code.len(), 4);
7295
7296        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7297        // Verify load bit is set
7298        assert_eq!(instr & 0x00100000, 0x00100000);
7299    }
7300
7301    #[test]
7302    fn test_encode_str_arm32() {
7303        let encoder = ArmEncoder::new_arm32();
7304        let op = ArmOp::Str {
7305            rd: Reg::R0,
7306            addr: MemAddr::imm(Reg::SP, 0),
7307        };
7308
7309        let code = encoder.encode(&op).unwrap();
7310        assert_eq!(code.len(), 4);
7311    }
7312
7313    #[test]
7314    fn test_encode_branch_arm32() {
7315        let encoder = ArmEncoder::new_arm32();
7316        let op = ArmOp::Bl {
7317            label: "main".to_string(),
7318        };
7319
7320        let code = encoder.encode(&op).unwrap();
7321        assert_eq!(code.len(), 4);
7322
7323        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7324        // Verify BL opcode
7325        assert_eq!(instr & 0x0F000000, 0x0B000000);
7326    }
7327
7328    /// Regression test for #167 + #174: the Thumb-2 BL relocatable placeholder
7329    /// must carry a -4 addend so an R_ARM_THM_CALL nets to exactly the symbol S.
7330    /// The correct encoding is what `gas` emits for `bl <extern>`: f7ff fffe
7331    /// (hw1=0xF7FF, hw2=0xFFFE), little-endian bytes FF F7 FE FF.
7332    ///   - 0xD000 (J1=J2=0) → ~+0x600000 garbage addend: `bl c0000c` / truncated
7333    ///     to fit (#167).
7334    ///   - 0xF800 (addend 0) → lands at S+4, one instruction past the callee
7335    ///     entry (#174).
7336    ///   - 0xFFFE (addend -4) → lands at S. Correct.
7337    #[test]
7338    fn test_encode_thumb_bl_placeholder_addend_167_174() {
7339        let encoder = ArmEncoder::new_thumb2();
7340        let op = ArmOp::Bl {
7341            label: "callee".to_string(),
7342        };
7343
7344        let code = encoder.encode(&op).unwrap();
7345        assert_eq!(code.len(), 4, "Thumb-2 BL is 32-bit");
7346
7347        let hw1 = u16::from_le_bytes([code[0], code[1]]);
7348        let hw2 = u16::from_le_bytes([code[2], code[3]]);
7349        assert_eq!(hw1, 0xF7FF, "BL first halfword (matches gas `bl <extern>`)");
7350        assert_eq!(
7351            hw2, 0xFFFE,
7352            "BL second halfword must be 0xFFFE (-4 addend → nets to S), not 0xF800 (→ S+4, #174) or 0xD000 (#167)"
7353        );
7354        assert_ne!(hw2, 0xF800, "0xF800 (addend 0) lands at S+4 (#174)");
7355        assert_ne!(hw2, 0xD000, "0xD000 bakes in a ~+0x600000 addend (#167)");
7356    }
7357
7358    #[test]
7359    fn test_encode_sequence() {
7360        let encoder = ArmEncoder::new_arm32();
7361        let ops = vec![
7362            ArmOp::Mov {
7363                rd: Reg::R0,
7364                op2: Operand2::Imm(42),
7365            },
7366            ArmOp::Mov {
7367                rd: Reg::R1,
7368                op2: Operand2::Imm(10),
7369            },
7370            ArmOp::Add {
7371                rd: Reg::R2,
7372                rn: Reg::R0,
7373                op2: Operand2::Reg(Reg::R1),
7374            },
7375        ];
7376
7377        let code = encoder.encode_sequence(&ops).unwrap();
7378        assert_eq!(code.len(), 12); // 3 instructions * 4 bytes
7379    }
7380
7381    #[test]
7382    fn test_reg_to_bits() {
7383        assert_eq!(reg_to_bits(&Reg::R0), 0);
7384        assert_eq!(reg_to_bits(&Reg::R7), 7);
7385        assert_eq!(reg_to_bits(&Reg::SP), 13);
7386        assert_eq!(reg_to_bits(&Reg::LR), 14);
7387        assert_eq!(reg_to_bits(&Reg::PC), 15);
7388    }
7389
7390    #[test]
7391    fn test_encode_bitwise_operations() {
7392        let encoder = ArmEncoder::new_arm32();
7393
7394        let and_op = ArmOp::And {
7395            rd: Reg::R0,
7396            rn: Reg::R1,
7397            op2: Operand2::Reg(Reg::R2),
7398        };
7399        let and_code = encoder.encode(&and_op).unwrap();
7400        assert_eq!(and_code.len(), 4);
7401
7402        let orr_op = ArmOp::Orr {
7403            rd: Reg::R0,
7404            rn: Reg::R1,
7405            op2: Operand2::Reg(Reg::R2),
7406        };
7407        let orr_code = encoder.encode(&orr_op).unwrap();
7408        assert_eq!(orr_code.len(), 4);
7409
7410        let eor_op = ArmOp::Eor {
7411            rd: Reg::R0,
7412            rn: Reg::R1,
7413            op2: Operand2::Reg(Reg::R2),
7414        };
7415        let eor_code = encoder.encode(&eor_op).unwrap();
7416        assert_eq!(eor_code.len(), 4);
7417    }
7418
7419    // === Thumb-2 32-bit encoding tests ===
7420
7421    #[test]
7422    fn test_encode_sdiv_thumb2() {
7423        let encoder = ArmEncoder::new_thumb2();
7424        let op = ArmOp::Sdiv {
7425            rd: Reg::R0,
7426            rn: Reg::R1,
7427            rm: Reg::R2,
7428        };
7429
7430        let code = encoder.encode(&op).unwrap();
7431        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7432
7433        // SDIV R0, R1, R2: 0xFB91 0xF0F2
7434        // First halfword: 0xFB90 | Rn(1) = 0xFB91
7435        // Second halfword: 0xF0F0 | Rd(0)<<8 | Rm(2) = 0xF0F2
7436        // Little-endian: [0x91, 0xFB, 0xF2, 0xF0]
7437        assert_eq!(code[0], 0x91);
7438        assert_eq!(code[1], 0xFB);
7439        assert_eq!(code[2], 0xF2);
7440        assert_eq!(code[3], 0xF0);
7441    }
7442
7443    #[test]
7444    fn test_encode_udiv_thumb2() {
7445        let encoder = ArmEncoder::new_thumb2();
7446        let op = ArmOp::Udiv {
7447            rd: Reg::R0,
7448            rn: Reg::R1,
7449            rm: Reg::R2,
7450        };
7451
7452        let code = encoder.encode(&op).unwrap();
7453        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7454
7455        // UDIV R0, R1, R2: 0xFBB1 0xF0F2
7456        // Little-endian: [0xB1, 0xFB, 0xF2, 0xF0]
7457        assert_eq!(code[0], 0xB1);
7458        assert_eq!(code[1], 0xFB);
7459        assert_eq!(code[2], 0xF2);
7460        assert_eq!(code[3], 0xF0);
7461    }
7462
7463    #[test]
7464    fn test_encode_mul_thumb2() {
7465        let encoder = ArmEncoder::new_thumb2();
7466        let op = ArmOp::Mul {
7467            rd: Reg::R0,
7468            rn: Reg::R1,
7469            rm: Reg::R2,
7470        };
7471
7472        let code = encoder.encode(&op).unwrap();
7473        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7474    }
7475
7476    #[test]
7477    fn test_encode_and_thumb2() {
7478        let encoder = ArmEncoder::new_thumb2();
7479        let op = ArmOp::And {
7480            rd: Reg::R0,
7481            rn: Reg::R1,
7482            op2: Operand2::Reg(Reg::R2),
7483        };
7484
7485        let code = encoder.encode(&op).unwrap();
7486        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7487    }
7488
7489    #[test]
7490    fn test_encode_lsl_thumb2_low_regs() {
7491        let encoder = ArmEncoder::new_thumb2();
7492        let op = ArmOp::Lsl {
7493            rd: Reg::R0,
7494            rn: Reg::R1,
7495            shift: 5,
7496        };
7497
7498        let code = encoder.encode(&op).unwrap();
7499        assert_eq!(code.len(), 2); // 16-bit for low registers
7500    }
7501
7502    #[test]
7503    fn test_encode_clz_thumb2() {
7504        let encoder = ArmEncoder::new_thumb2();
7505        let op = ArmOp::Clz {
7506            rd: Reg::R0,
7507            rm: Reg::R1,
7508        };
7509
7510        let code = encoder.encode(&op).unwrap();
7511        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7512    }
7513
7514    #[test]
7515    fn test_encode_bx_thumb2() {
7516        let encoder = ArmEncoder::new_thumb2();
7517        let op = ArmOp::Bx { rm: Reg::LR };
7518
7519        let code = encoder.encode(&op).unwrap();
7520        assert_eq!(code.len(), 2); // 16-bit instruction
7521
7522        // BX LR: 0x4770
7523        assert_eq!(code, vec![0x70, 0x47]);
7524    }
7525
7526    // ========================================================================
7527    // f32 pseudo-op encoding tests
7528    // ========================================================================
7529
7530    #[test]
7531    fn test_encode_f32_abs_arm32() {
7532        let encoder = ArmEncoder::new_arm32();
7533        let op = ArmOp::F32Abs {
7534            sd: VfpReg::S0,
7535            sm: VfpReg::S2,
7536        };
7537        let code = encoder.encode(&op).unwrap();
7538        assert_eq!(code.len(), 4); // Single VFP instruction
7539    }
7540
7541    #[test]
7542    fn test_encode_f32_neg_arm32() {
7543        let encoder = ArmEncoder::new_arm32();
7544        let op = ArmOp::F32Neg {
7545            sd: VfpReg::S0,
7546            sm: VfpReg::S2,
7547        };
7548        let code = encoder.encode(&op).unwrap();
7549        assert_eq!(code.len(), 4);
7550    }
7551
7552    #[test]
7553    fn test_encode_f32_sqrt_arm32() {
7554        let encoder = ArmEncoder::new_arm32();
7555        let op = ArmOp::F32Sqrt {
7556            sd: VfpReg::S0,
7557            sm: VfpReg::S2,
7558        };
7559        let code = encoder.encode(&op).unwrap();
7560        assert_eq!(code.len(), 4);
7561    }
7562
7563    #[test]
7564    fn test_encode_f32_ceil_arm32() {
7565        let encoder = ArmEncoder::new_arm32();
7566        let op = ArmOp::F32Ceil {
7567            sd: VfpReg::S0,
7568            sm: VfpReg::S2,
7569        };
7570        let code = encoder.encode(&op).unwrap();
7571        // VMRS + BIC + ORR + VMSR + VCVT.S32.F32 + VMRS + BIC + VMSR + VCVT.F32.S32
7572        assert_eq!(code.len(), 36);
7573    }
7574
7575    #[test]
7576    fn test_encode_f32_floor_thumb2() {
7577        let encoder = ArmEncoder::new_thumb2();
7578        let op = ArmOp::F32Floor {
7579            sd: VfpReg::S0,
7580            sm: VfpReg::S2,
7581        };
7582        let code = encoder.encode(&op).unwrap();
7583        // VMRS + BIC.W + ORR.W + VMSR + VCVT + VMRS + BIC.W + VMSR + VCVT.F32.S32
7584        assert_eq!(code.len(), 36);
7585    }
7586
7587    #[test]
7588    fn test_encode_f32_min_arm32() {
7589        let encoder = ArmEncoder::new_arm32();
7590        let op = ArmOp::F32Min {
7591            sd: VfpReg::S0,
7592            sn: VfpReg::S2,
7593            sm: VfpReg::S4,
7594        };
7595        let code = encoder.encode(&op).unwrap();
7596        assert_eq!(code.len(), 16); // VMOV + VCMP + VMRS + conditional VMOV
7597    }
7598
7599    #[test]
7600    fn test_encode_f32_max_thumb2() {
7601        let encoder = ArmEncoder::new_thumb2();
7602        let op = ArmOp::F32Max {
7603            sd: VfpReg::S0,
7604            sn: VfpReg::S2,
7605            sm: VfpReg::S4,
7606        };
7607        let code = encoder.encode(&op).unwrap();
7608        // VMOV(4) + VCMP(4) + VMRS(4) + IT(2) + VMOV(4) = 18
7609        assert_eq!(code.len(), 18);
7610    }
7611
7612    #[test]
7613    fn test_encode_f32_copysign_arm32() {
7614        let encoder = ArmEncoder::new_arm32();
7615        let op = ArmOp::F32Copysign {
7616            sd: VfpReg::S0,
7617            sn: VfpReg::S2,
7618            sm: VfpReg::S4,
7619        };
7620        let code = encoder.encode(&op).unwrap();
7621        // VMOV + VMOV + AND + BIC + ORR + VMOV = 6 * 4 = 24
7622        assert_eq!(code.len(), 24);
7623    }
7624
7625    // ========================================================================
7626    // f64 encoding tests
7627    // ========================================================================
7628
7629    #[test]
7630    fn test_encode_f64_add_arm32() {
7631        let encoder = ArmEncoder::new_arm32();
7632        let op = ArmOp::F64Add {
7633            dd: VfpReg::D0,
7634            dn: VfpReg::D1,
7635            dm: VfpReg::D2,
7636        };
7637        let code = encoder.encode(&op).unwrap();
7638        assert_eq!(code.len(), 4);
7639        // VADD.F64 D0, D1, D2: check coprocessor is cp11 (0xB)
7640        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7641        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11
7642    }
7643
7644    #[test]
7645    fn test_encode_f64_sub_thumb2() {
7646        let encoder = ArmEncoder::new_thumb2();
7647        let op = ArmOp::F64Sub {
7648            dd: VfpReg::D0,
7649            dn: VfpReg::D1,
7650            dm: VfpReg::D2,
7651        };
7652        let code = encoder.encode(&op).unwrap();
7653        assert_eq!(code.len(), 4); // 32-bit VFP as two Thumb halfwords
7654    }
7655
7656    #[test]
7657    fn test_encode_f64_mul_arm32() {
7658        let encoder = ArmEncoder::new_arm32();
7659        let op = ArmOp::F64Mul {
7660            dd: VfpReg::D0,
7661            dn: VfpReg::D1,
7662            dm: VfpReg::D2,
7663        };
7664        let code = encoder.encode(&op).unwrap();
7665        assert_eq!(code.len(), 4);
7666    }
7667
7668    #[test]
7669    fn test_encode_f64_div_arm32() {
7670        let encoder = ArmEncoder::new_arm32();
7671        let op = ArmOp::F64Div {
7672            dd: VfpReg::D0,
7673            dn: VfpReg::D1,
7674            dm: VfpReg::D2,
7675        };
7676        let code = encoder.encode(&op).unwrap();
7677        assert_eq!(code.len(), 4);
7678    }
7679
7680    #[test]
7681    fn test_encode_f64_abs_arm32() {
7682        let encoder = ArmEncoder::new_arm32();
7683        let op = ArmOp::F64Abs {
7684            dd: VfpReg::D0,
7685            dm: VfpReg::D2,
7686        };
7687        let code = encoder.encode(&op).unwrap();
7688        assert_eq!(code.len(), 4);
7689    }
7690
7691    #[test]
7692    fn test_encode_f64_neg_arm32() {
7693        let encoder = ArmEncoder::new_arm32();
7694        let op = ArmOp::F64Neg {
7695            dd: VfpReg::D0,
7696            dm: VfpReg::D2,
7697        };
7698        let code = encoder.encode(&op).unwrap();
7699        assert_eq!(code.len(), 4);
7700    }
7701
7702    #[test]
7703    fn test_encode_f64_sqrt_arm32() {
7704        let encoder = ArmEncoder::new_arm32();
7705        let op = ArmOp::F64Sqrt {
7706            dd: VfpReg::D0,
7707            dm: VfpReg::D2,
7708        };
7709        let code = encoder.encode(&op).unwrap();
7710        assert_eq!(code.len(), 4);
7711    }
7712
7713    #[test]
7714    fn test_encode_f64_load_arm32() {
7715        let encoder = ArmEncoder::new_arm32();
7716        let op = ArmOp::F64Load {
7717            dd: VfpReg::D0,
7718            addr: MemAddr::imm(Reg::R0, 8),
7719        };
7720        let code = encoder.encode(&op).unwrap();
7721        assert_eq!(code.len(), 4);
7722        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7723        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11 for F64
7724        assert_eq!(instr & 0xFF, 2); // offset 8 / 4 = 2
7725    }
7726
7727    #[test]
7728    fn test_encode_f64_store_thumb2() {
7729        let encoder = ArmEncoder::new_thumb2();
7730        let op = ArmOp::F64Store {
7731            dd: VfpReg::D0,
7732            addr: MemAddr::imm(Reg::SP, 0),
7733        };
7734        let code = encoder.encode(&op).unwrap();
7735        assert_eq!(code.len(), 4);
7736    }
7737
7738    #[test]
7739    fn test_encode_f64_compare_arm32() {
7740        let encoder = ArmEncoder::new_arm32();
7741        let op = ArmOp::F64Eq {
7742            rd: Reg::R0,
7743            dn: VfpReg::D0,
7744            dm: VfpReg::D1,
7745        };
7746        let code = encoder.encode(&op).unwrap();
7747        assert_eq!(code.len(), 16); // VCMP + VMRS + MOV #0 + MOVcond #1
7748    }
7749
7750    #[test]
7751    fn test_encode_f64_compare_thumb2() {
7752        let encoder = ArmEncoder::new_thumb2();
7753        let op = ArmOp::F64Lt {
7754            rd: Reg::R0,
7755            dn: VfpReg::D0,
7756            dm: VfpReg::D1,
7757        };
7758        let code = encoder.encode(&op).unwrap();
7759        // VCMP(4) + VMRS(4) + MOVS(2) + IT(2) + MOV(2) = 14
7760        assert_eq!(code.len(), 14);
7761    }
7762
7763    #[test]
7764    fn test_encode_f64_const_arm32() {
7765        let encoder = ArmEncoder::new_arm32();
7766        let op = ArmOp::F64Const {
7767            dd: VfpReg::D0,
7768            value: 3.125,
7769        };
7770        let code = encoder.encode(&op).unwrap();
7771        // MOVW(4) + MOVT(4) + MOVW(4) + MOVT(4) + VMOV(4) = 20
7772        assert_eq!(code.len(), 20);
7773    }
7774
7775    #[test]
7776    fn test_encode_f64_const_thumb2() {
7777        let encoder = ArmEncoder::new_thumb2();
7778        let op = ArmOp::F64Const {
7779            dd: VfpReg::D0,
7780            value: 2.5,
7781        };
7782        let code = encoder.encode(&op).unwrap();
7783        // MOVW(4) + MOVT(4) + MOVW(4) + MOVT(4) + VMOV(4) = 20
7784        assert_eq!(code.len(), 20);
7785    }
7786
7787    #[test]
7788    fn test_encode_f64_convert_i32s_arm32() {
7789        let encoder = ArmEncoder::new_arm32();
7790        let op = ArmOp::F64ConvertI32S {
7791            dd: VfpReg::D0,
7792            rm: Reg::R0,
7793        };
7794        let code = encoder.encode(&op).unwrap();
7795        // VMOV(4) + VCVT(4) = 8
7796        assert_eq!(code.len(), 8);
7797    }
7798
7799    #[test]
7800    fn test_encode_f64_promote_f32_arm32() {
7801        let encoder = ArmEncoder::new_arm32();
7802        let op = ArmOp::F64PromoteF32 {
7803            dd: VfpReg::D0,
7804            sm: VfpReg::S0,
7805        };
7806        let code = encoder.encode(&op).unwrap();
7807        assert_eq!(code.len(), 4); // Single VCVT.F64.F32 instruction
7808    }
7809
7810    #[test]
7811    fn test_encode_f64_promote_f32_thumb2() {
7812        let encoder = ArmEncoder::new_thumb2();
7813        let op = ArmOp::F64PromoteF32 {
7814            dd: VfpReg::D0,
7815            sm: VfpReg::S0,
7816        };
7817        let code = encoder.encode(&op).unwrap();
7818        assert_eq!(code.len(), 4);
7819    }
7820
7821    #[test]
7822    fn test_encode_i32_trunc_f64s_arm32() {
7823        let encoder = ArmEncoder::new_arm32();
7824        let op = ArmOp::I32TruncF64S {
7825            rd: Reg::R0,
7826            dm: VfpReg::D0,
7827        };
7828        let code = encoder.encode(&op).unwrap();
7829        // VCVT(4) + VMOV(4) = 8
7830        assert_eq!(code.len(), 8);
7831    }
7832
7833    #[test]
7834    fn test_encode_f64_reinterpret_i64_arm32() {
7835        let encoder = ArmEncoder::new_arm32();
7836        let op = ArmOp::F64ReinterpretI64 {
7837            dd: VfpReg::D0,
7838            rmlo: Reg::R0,
7839            rmhi: Reg::R1,
7840        };
7841        let code = encoder.encode(&op).unwrap();
7842        assert_eq!(code.len(), 4); // Single VMOV instruction
7843    }
7844
7845    #[test]
7846    fn test_encode_i64_reinterpret_f64_thumb2() {
7847        let encoder = ArmEncoder::new_thumb2();
7848        let op = ArmOp::I64ReinterpretF64 {
7849            rdlo: Reg::R0,
7850            rdhi: Reg::R1,
7851            dm: VfpReg::D0,
7852        };
7853        let code = encoder.encode(&op).unwrap();
7854        assert_eq!(code.len(), 4);
7855    }
7856
7857    #[test]
7858    fn test_encode_f64_trunc_thumb2() {
7859        let encoder = ArmEncoder::new_thumb2();
7860        let op = ArmOp::F64Trunc {
7861            dd: VfpReg::D0,
7862            dm: VfpReg::D1,
7863        };
7864        let code = encoder.encode(&op).unwrap();
7865        // Two VFP instructions via Thumb encoding
7866        assert_eq!(code.len(), 8);
7867    }
7868
7869    #[test]
7870    fn test_encode_f64_min_arm32() {
7871        let encoder = ArmEncoder::new_arm32();
7872        let op = ArmOp::F64Min {
7873            dd: VfpReg::D0,
7874            dn: VfpReg::D1,
7875            dm: VfpReg::D2,
7876        };
7877        let code = encoder.encode(&op).unwrap();
7878        // VMOV + VCMP + VMRS + conditional VMOV = 16
7879        assert_eq!(code.len(), 16);
7880    }
7881
7882    #[test]
7883    fn test_f64_cp11_encoding() {
7884        // Verify that F64 instructions use coprocessor 11 (0xB), not 10 (0xA)
7885        let encoder = ArmEncoder::new_arm32();
7886
7887        // F64Add
7888        let code = encoder
7889            .encode(&ArmOp::F64Add {
7890                dd: VfpReg::D0,
7891                dn: VfpReg::D0,
7892                dm: VfpReg::D0,
7893            })
7894            .unwrap();
7895        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7896        assert_eq!((instr >> 8) & 0xF, 0xB, "F64 should use cp11");
7897
7898        // F32Add for comparison
7899        let code = encoder
7900            .encode(&ArmOp::F32Add {
7901                sd: VfpReg::S0,
7902                sn: VfpReg::S0,
7903                sm: VfpReg::S0,
7904            })
7905            .unwrap();
7906        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7907        assert_eq!((instr >> 8) & 0xF, 0xA, "F32 should use cp10");
7908    }
7909
7910    #[test]
7911    fn test_dreg_encoding_higher_registers() {
7912        let encoder = ArmEncoder::new_arm32();
7913
7914        // Test with D15 (highest register)
7915        let op = ArmOp::F64Add {
7916            dd: VfpReg::D15,
7917            dn: VfpReg::D14,
7918            dm: VfpReg::D13,
7919        };
7920        let code = encoder.encode(&op).unwrap();
7921        assert_eq!(code.len(), 4);
7922
7923        // Verify the register encoding worked (instruction is valid)
7924        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7925        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11
7926    }
7927
7928    // ========================================================================
7929    // Control flow encoding tests
7930    // ========================================================================
7931
7932    #[test]
7933    fn test_encode_label_emits_no_bytes() {
7934        let encoder = ArmEncoder::new_thumb2();
7935        let op = ArmOp::Label {
7936            name: ".Lblock_end_0".to_string(),
7937        };
7938        let code = encoder.encode(&op).unwrap();
7939        assert!(code.is_empty(), "Label should emit zero bytes");
7940
7941        let encoder32 = ArmEncoder::new_arm32();
7942        let code32 = encoder32.encode(&op).unwrap();
7943        assert!(
7944            code32.is_empty(),
7945            "Label should emit zero bytes in ARM32 too"
7946        );
7947    }
7948
7949    #[test]
7950    fn test_encode_bcc_eq_thumb2() {
7951        use synth_synthesis::Condition;
7952        let encoder = ArmEncoder::new_thumb2();
7953        let op = ArmOp::Bcc {
7954            cond: Condition::EQ,
7955            label: "target".to_string(),
7956        };
7957        let code = encoder.encode(&op).unwrap();
7958        assert_eq!(code.len(), 2); // 16-bit conditional branch
7959
7960        // BEQ with offset 0: 0xD000 in little-endian
7961        assert_eq!(code, vec![0x00, 0xD0]);
7962    }
7963
7964    #[test]
7965    fn test_encode_bcc_ne_thumb2() {
7966        use synth_synthesis::Condition;
7967        let encoder = ArmEncoder::new_thumb2();
7968        let op = ArmOp::Bcc {
7969            cond: Condition::NE,
7970            label: "target".to_string(),
7971        };
7972        let code = encoder.encode(&op).unwrap();
7973        assert_eq!(code.len(), 2);
7974
7975        // BNE with offset 0: 0xD100 in little-endian
7976        assert_eq!(code, vec![0x00, 0xD1]);
7977    }
7978
7979    #[test]
7980    fn test_encode_bcc_arm32() {
7981        use synth_synthesis::Condition;
7982        let encoder = ArmEncoder::new_arm32();
7983        let op = ArmOp::Bcc {
7984            cond: Condition::EQ,
7985            label: "target".to_string(),
7986        };
7987        let code = encoder.encode(&op).unwrap();
7988        assert_eq!(code.len(), 4); // 32-bit ARM instruction
7989
7990        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7991        // BEQ: cond=0x0, opcode=0xA, offset=0
7992        assert_eq!(instr & 0xF0000000, 0x00000000); // EQ condition
7993        assert_eq!(instr & 0x0F000000, 0x0A000000); // Branch opcode
7994    }
7995
7996    #[test]
7997    fn test_encode_udf_thumb2() {
7998        let encoder = ArmEncoder::new_thumb2();
7999        let op = ArmOp::Udf { imm: 0 };
8000        let code = encoder.encode(&op).unwrap();
8001        assert_eq!(code.len(), 2); // 16-bit
8002
8003        // UDF #0: 0xDE00 in little-endian
8004        assert_eq!(code, vec![0x00, 0xDE]);
8005    }
8006
8007    #[test]
8008    fn test_encode_nop_thumb2() {
8009        let encoder = ArmEncoder::new_thumb2();
8010        let op = ArmOp::Nop;
8011        let code = encoder.encode(&op).unwrap();
8012        assert_eq!(code.len(), 2); // 16-bit
8013
8014        // NOP: 0xBF00 in little-endian
8015        assert_eq!(code, vec![0x00, 0xBF]);
8016    }
8017
8018    // =========================================================================
8019    // i64 Thumb-2 encoding tests
8020    // =========================================================================
8021
8022    #[test]
8023    fn test_encode_i64_add_thumb2() {
8024        let encoder = ArmEncoder::new_thumb2();
8025        let op = ArmOp::I64Add {
8026            rdlo: Reg::R0,
8027            rdhi: Reg::R1,
8028            rnlo: Reg::R0,
8029            rnhi: Reg::R1,
8030            rmlo: Reg::R2,
8031            rmhi: Reg::R3,
8032        };
8033        let code = encoder.encode(&op).unwrap();
8034        // Should emit ADDS (2 bytes) + ADC.W (4 bytes) = 6 bytes
8035        assert_eq!(code.len(), 6, "I64Add should be 6 bytes (ADDS + ADC.W)");
8036    }
8037
8038    #[test]
8039    fn test_encode_i64_sub_thumb2() {
8040        let encoder = ArmEncoder::new_thumb2();
8041        let op = ArmOp::I64Sub {
8042            rdlo: Reg::R0,
8043            rdhi: Reg::R1,
8044            rnlo: Reg::R0,
8045            rnhi: Reg::R1,
8046            rmlo: Reg::R2,
8047            rmhi: Reg::R3,
8048        };
8049        let code = encoder.encode(&op).unwrap();
8050        // Should emit SUBS (2 bytes) + SBC.W (4 bytes) = 6 bytes
8051        assert_eq!(code.len(), 6, "I64Sub should be 6 bytes (SUBS + SBC.W)");
8052    }
8053
8054    #[test]
8055    fn test_encode_i64_and_thumb2() {
8056        let encoder = ArmEncoder::new_thumb2();
8057        let op = ArmOp::I64And {
8058            rdlo: Reg::R0,
8059            rdhi: Reg::R1,
8060            rnlo: Reg::R0,
8061            rnhi: Reg::R1,
8062            rmlo: Reg::R2,
8063            rmhi: Reg::R3,
8064        };
8065        let code = encoder.encode(&op).unwrap();
8066        // AND.W (4 bytes) + AND.W (4 bytes) = 8 bytes
8067        assert!(code.len() >= 4, "I64And should emit at least 4 bytes");
8068    }
8069
8070    #[test]
8071    fn test_encode_i64_or_thumb2() {
8072        let encoder = ArmEncoder::new_thumb2();
8073        let op = ArmOp::I64Or {
8074            rdlo: Reg::R0,
8075            rdhi: Reg::R1,
8076            rnlo: Reg::R0,
8077            rnhi: Reg::R1,
8078            rmlo: Reg::R2,
8079            rmhi: Reg::R3,
8080        };
8081        let code = encoder.encode(&op).unwrap();
8082        assert!(code.len() >= 4, "I64Or should emit at least 4 bytes");
8083    }
8084
8085    #[test]
8086    fn test_encode_i64_xor_thumb2() {
8087        let encoder = ArmEncoder::new_thumb2();
8088        let op = ArmOp::I64Xor {
8089            rdlo: Reg::R0,
8090            rdhi: Reg::R1,
8091            rnlo: Reg::R0,
8092            rnhi: Reg::R1,
8093            rmlo: Reg::R2,
8094            rmhi: Reg::R3,
8095        };
8096        let code = encoder.encode(&op).unwrap();
8097        assert!(code.len() >= 4, "I64Xor should emit at least 4 bytes");
8098    }
8099
8100    #[test]
8101    fn test_encode_i64_const_small_thumb2() {
8102        let encoder = ArmEncoder::new_thumb2();
8103        // Small constant: only needs MOVW for each half
8104        let op = ArmOp::I64Const {
8105            rdlo: Reg::R0,
8106            rdhi: Reg::R1,
8107            value: 42,
8108        };
8109        let code = encoder.encode(&op).unwrap();
8110        // MOVW R0, #42 (4 bytes) + MOVW R1, #0 (4 bytes) = 8 bytes minimum
8111        assert!(code.len() >= 8, "I64Const should emit at least 8 bytes");
8112    }
8113
8114    #[test]
8115    fn test_encode_i64_const_large_thumb2() {
8116        let encoder = ArmEncoder::new_thumb2();
8117        // Large constant: needs MOVW+MOVT for each half
8118        let op = ArmOp::I64Const {
8119            rdlo: Reg::R0,
8120            rdhi: Reg::R1,
8121            value: 0x1234_5678_9ABC_DEF0_u64 as i64,
8122        };
8123        let code = encoder.encode(&op).unwrap();
8124        // MOVW + MOVT for lo (8 bytes) + MOVW + MOVT for hi (8 bytes) = 16 bytes
8125        assert_eq!(
8126            code.len(),
8127            16,
8128            "I64Const with large value should be 16 bytes"
8129        );
8130    }
8131
8132    #[test]
8133    fn test_encode_i64_extend_i32_s_thumb2() {
8134        let encoder = ArmEncoder::new_thumb2();
8135        let op = ArmOp::I64ExtendI32S {
8136            rdlo: Reg::R0,
8137            rdhi: Reg::R1,
8138            rn: Reg::R0,
8139        };
8140        let code = encoder.encode(&op).unwrap();
8141        // When rdlo == rn, only ASR (4 bytes) is emitted
8142        assert_eq!(
8143            code.len(),
8144            4,
8145            "I64ExtendI32S (same reg) should be 4 bytes (ASR only)"
8146        );
8147    }
8148
8149    #[test]
8150    fn test_encode_i64_extend_i32_s_diff_reg_thumb2() {
8151        let encoder = ArmEncoder::new_thumb2();
8152        let op = ArmOp::I64ExtendI32S {
8153            rdlo: Reg::R0,
8154            rdhi: Reg::R1,
8155            rn: Reg::R2,
8156        };
8157        let code = encoder.encode(&op).unwrap();
8158        // MOV rdlo, rn (2 bytes for low regs) + ASR rdhi, rdlo, #31 (4 bytes) = 6 bytes
8159        assert!(
8160            code.len() >= 6,
8161            "I64ExtendI32S (diff reg) should be at least 6 bytes"
8162        );
8163    }
8164
8165    #[test]
8166    fn test_encode_i64_extend_i32_u_thumb2() {
8167        let encoder = ArmEncoder::new_thumb2();
8168        let op = ArmOp::I64ExtendI32U {
8169            rdlo: Reg::R0,
8170            rdhi: Reg::R1,
8171            rn: Reg::R0,
8172        };
8173        let code = encoder.encode(&op).unwrap();
8174        // When rdlo == rn, only MOV rdhi, #0 (2 bytes) is emitted
8175        assert_eq!(
8176            code.len(),
8177            2,
8178            "I64ExtendI32U (same reg) should be 2 bytes (MOV #0 only)"
8179        );
8180    }
8181
8182    #[test]
8183    fn test_encode_i32_wrap_i64_nop_thumb2() {
8184        let encoder = ArmEncoder::new_thumb2();
8185        // When rd == rnlo, should be a NOP
8186        let op = ArmOp::I32WrapI64 {
8187            rd: Reg::R0,
8188            rnlo: Reg::R0,
8189        };
8190        let code = encoder.encode(&op).unwrap();
8191        assert_eq!(code.len(), 2, "I32WrapI64 same reg should be NOP (2 bytes)");
8192        assert_eq!(code, vec![0x00, 0xBF]); // NOP
8193    }
8194
8195    #[test]
8196    fn test_encode_i32_wrap_i64_diff_reg_thumb2() {
8197        let encoder = ArmEncoder::new_thumb2();
8198        let op = ArmOp::I32WrapI64 {
8199            rd: Reg::R2,
8200            rnlo: Reg::R0,
8201        };
8202        let code = encoder.encode(&op).unwrap();
8203        // MOV R2, R0 (2 or 4 bytes)
8204        assert!(
8205            code.len() >= 2,
8206            "I32WrapI64 diff reg should emit at least 2 bytes"
8207        );
8208    }
8209
8210    #[test]
8211    fn test_encode_i64_eqz_thumb2() {
8212        let encoder = ArmEncoder::new_thumb2();
8213        let op = ArmOp::I64Eqz {
8214            rd: Reg::R0,
8215            rnlo: Reg::R0,
8216            rnhi: Reg::R1,
8217        };
8218        let code = encoder.encode(&op).unwrap();
8219        // Delegates to I64SetCondZ which is already encoded
8220        assert!(
8221            code.len() >= 6,
8222            "I64Eqz should emit at least 6 bytes for ORR+ITE+MOV+MOV"
8223        );
8224    }
8225
8226    #[test]
8227    fn test_encode_i64_eq_thumb2() {
8228        let encoder = ArmEncoder::new_thumb2();
8229        let op = ArmOp::I64Eq {
8230            rd: Reg::R0,
8231            rnlo: Reg::R0,
8232            rnhi: Reg::R1,
8233            rmlo: Reg::R2,
8234            rmhi: Reg::R3,
8235        };
8236        let code = encoder.encode(&op).unwrap();
8237        // Delegates to I64SetCond EQ: CMP lo + IT EQ + CMPEQ hi + ITE EQ + MOV 1 + MOV 0
8238        assert!(code.len() >= 10, "I64Eq should emit at least 10 bytes");
8239    }
8240
8241    #[test]
8242    fn test_encode_i64_ldr_thumb2() {
8243        let encoder = ArmEncoder::new_thumb2();
8244        let op = ArmOp::I64Ldr {
8245            rdlo: Reg::R0,
8246            rdhi: Reg::R1,
8247            addr: MemAddr::imm(Reg::SP, 0),
8248        };
8249        let code = encoder.encode(&op).unwrap();
8250        // Two LDR instructions (lo at offset, hi at offset+4)
8251        assert!(code.len() >= 4, "I64Ldr should emit at least 4 bytes");
8252    }
8253
8254    #[test]
8255    fn test_encode_i64_str_thumb2() {
8256        let encoder = ArmEncoder::new_thumb2();
8257        let op = ArmOp::I64Str {
8258            rdlo: Reg::R0,
8259            rdhi: Reg::R1,
8260            addr: MemAddr::imm(Reg::SP, 0),
8261        };
8262        let code = encoder.encode(&op).unwrap();
8263        // Two STR instructions (lo at offset, hi at offset+4)
8264        assert!(code.len() >= 4, "I64Str should emit at least 4 bytes");
8265    }
8266
8267    #[test]
8268    fn test_encode_i64_all_comparisons_thumb2() {
8269        let encoder = ArmEncoder::new_thumb2();
8270
8271        let ops = vec![
8272            ArmOp::I64Ne {
8273                rd: Reg::R0,
8274                rnlo: Reg::R0,
8275                rnhi: Reg::R1,
8276                rmlo: Reg::R2,
8277                rmhi: Reg::R3,
8278            },
8279            ArmOp::I64LtS {
8280                rd: Reg::R0,
8281                rnlo: Reg::R0,
8282                rnhi: Reg::R1,
8283                rmlo: Reg::R2,
8284                rmhi: Reg::R3,
8285            },
8286            ArmOp::I64LtU {
8287                rd: Reg::R0,
8288                rnlo: Reg::R0,
8289                rnhi: Reg::R1,
8290                rmlo: Reg::R2,
8291                rmhi: Reg::R3,
8292            },
8293            ArmOp::I64LeS {
8294                rd: Reg::R0,
8295                rnlo: Reg::R0,
8296                rnhi: Reg::R1,
8297                rmlo: Reg::R2,
8298                rmhi: Reg::R3,
8299            },
8300            ArmOp::I64LeU {
8301                rd: Reg::R0,
8302                rnlo: Reg::R0,
8303                rnhi: Reg::R1,
8304                rmlo: Reg::R2,
8305                rmhi: Reg::R3,
8306            },
8307            ArmOp::I64GtS {
8308                rd: Reg::R0,
8309                rnlo: Reg::R0,
8310                rnhi: Reg::R1,
8311                rmlo: Reg::R2,
8312                rmhi: Reg::R3,
8313            },
8314            ArmOp::I64GtU {
8315                rd: Reg::R0,
8316                rnlo: Reg::R0,
8317                rnhi: Reg::R1,
8318                rmlo: Reg::R2,
8319                rmhi: Reg::R3,
8320            },
8321            ArmOp::I64GeS {
8322                rd: Reg::R0,
8323                rnlo: Reg::R0,
8324                rnhi: Reg::R1,
8325                rmlo: Reg::R2,
8326                rmhi: Reg::R3,
8327            },
8328            ArmOp::I64GeU {
8329                rd: Reg::R0,
8330                rnlo: Reg::R0,
8331                rnhi: Reg::R1,
8332                rmlo: Reg::R2,
8333                rmhi: Reg::R3,
8334            },
8335        ];
8336
8337        for op in &ops {
8338            let code = encoder.encode(op).unwrap();
8339            assert!(
8340                code.len() >= 8,
8341                "i64 comparison {:?} should emit at least 8 bytes, got {}",
8342                op,
8343                code.len()
8344            );
8345        }
8346    }
8347
8348    #[test]
8349    fn test_encode_i64_const_zero_thumb2() {
8350        let encoder = ArmEncoder::new_thumb2();
8351        let op = ArmOp::I64Const {
8352            rdlo: Reg::R0,
8353            rdhi: Reg::R1,
8354            value: 0,
8355        };
8356        let code = encoder.encode(&op).unwrap();
8357        // MOVW R0, #0 (4 bytes) + MOVW R1, #0 (4 bytes) = 8 bytes
8358        assert_eq!(code.len(), 8, "I64Const(0) should be 8 bytes");
8359    }
8360
8361    #[test]
8362    fn test_encode_i64_const_negative_one_thumb2() {
8363        let encoder = ArmEncoder::new_thumb2();
8364        let op = ArmOp::I64Const {
8365            rdlo: Reg::R0,
8366            rdhi: Reg::R1,
8367            value: -1, // 0xFFFF_FFFF_FFFF_FFFF
8368        };
8369        let code = encoder.encode(&op).unwrap();
8370        // MOVW + MOVT for lo (8 bytes) + MOVW + MOVT for hi (8 bytes) = 16 bytes
8371        assert_eq!(code.len(), 16, "I64Const(-1) should be 16 bytes");
8372    }
8373
8374    // =========================================================================
8375    // Sub-word load/store encoding tests
8376    // =========================================================================
8377
8378    #[test]
8379    fn test_encode_ldrb_arm32() {
8380        let encoder = ArmEncoder::new_arm32();
8381        let op = ArmOp::Ldrb {
8382            rd: Reg::R0,
8383            addr: MemAddr::imm(Reg::R1, 4),
8384        };
8385        let code = encoder.encode(&op).unwrap();
8386        assert_eq!(code.len(), 4, "ARM32 LDRB should be 4 bytes");
8387        // LDRB R0, [R1, #4] = 0xE5D10004
8388        let encoded = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8389        assert_eq!(encoded, 0xE5D10004, "Should encode LDRB R0, [R1, #4]");
8390    }
8391
8392    #[test]
8393    fn test_encode_strb_arm32() {
8394        let encoder = ArmEncoder::new_arm32();
8395        let op = ArmOp::Strb {
8396            rd: Reg::R0,
8397            addr: MemAddr::imm(Reg::R1, 0),
8398        };
8399        let code = encoder.encode(&op).unwrap();
8400        assert_eq!(code.len(), 4, "ARM32 STRB should be 4 bytes");
8401        // STRB R0, [R1, #0] = 0xE5C10000
8402        let encoded = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8403        assert_eq!(encoded, 0xE5C10000, "Should encode STRB R0, [R1, #0]");
8404    }
8405
8406    #[test]
8407    fn test_encode_ldrh_arm32() {
8408        let encoder = ArmEncoder::new_arm32();
8409        let op = ArmOp::Ldrh {
8410            rd: Reg::R0,
8411            addr: MemAddr::imm(Reg::R1, 2),
8412        };
8413        let code = encoder.encode(&op).unwrap();
8414        assert_eq!(code.len(), 4, "ARM32 LDRH should be 4 bytes");
8415    }
8416
8417    #[test]
8418    fn test_encode_strh_arm32() {
8419        let encoder = ArmEncoder::new_arm32();
8420        let op = ArmOp::Strh {
8421            rd: Reg::R0,
8422            addr: MemAddr::imm(Reg::R1, 0),
8423        };
8424        let code = encoder.encode(&op).unwrap();
8425        assert_eq!(code.len(), 4, "ARM32 STRH should be 4 bytes");
8426    }
8427
8428    #[test]
8429    fn test_encode_ldrsb_arm32() {
8430        let encoder = ArmEncoder::new_arm32();
8431        let op = ArmOp::Ldrsb {
8432            rd: Reg::R0,
8433            addr: MemAddr::imm(Reg::R1, 0),
8434        };
8435        let code = encoder.encode(&op).unwrap();
8436        assert_eq!(code.len(), 4, "ARM32 LDRSB should be 4 bytes");
8437    }
8438
8439    #[test]
8440    fn test_encode_ldrsh_arm32() {
8441        let encoder = ArmEncoder::new_arm32();
8442        let op = ArmOp::Ldrsh {
8443            rd: Reg::R0,
8444            addr: MemAddr::imm(Reg::R1, 0),
8445        };
8446        let code = encoder.encode(&op).unwrap();
8447        assert_eq!(code.len(), 4, "ARM32 LDRSH should be 4 bytes");
8448    }
8449
8450    #[test]
8451    fn test_encode_ldrb_thumb2_16bit() {
8452        let encoder = ArmEncoder::new_thumb2();
8453        let op = ArmOp::Ldrb {
8454            rd: Reg::R0,
8455            addr: MemAddr::imm(Reg::R1, 4),
8456        };
8457        let code = encoder.encode(&op).unwrap();
8458        // Low registers + small offset -> 16-bit encoding
8459        assert_eq!(
8460            code.len(),
8461            2,
8462            "Thumb-2 LDRB with small offset should be 16-bit"
8463        );
8464    }
8465
8466    #[test]
8467    fn test_encode_ldrb_thumb2_32bit() {
8468        let encoder = ArmEncoder::new_thumb2();
8469        let op = ArmOp::Ldrb {
8470            rd: Reg::R0,
8471            addr: MemAddr::imm(Reg::R1, 100), // offset > 31 needs 32-bit
8472        };
8473        let code = encoder.encode(&op).unwrap();
8474        assert_eq!(
8475            code.len(),
8476            4,
8477            "Thumb-2 LDRB with large offset should be 32-bit"
8478        );
8479    }
8480
8481    #[test]
8482    fn test_encode_strb_thumb2_16bit() {
8483        let encoder = ArmEncoder::new_thumb2();
8484        let op = ArmOp::Strb {
8485            rd: Reg::R0,
8486            addr: MemAddr::imm(Reg::R1, 10),
8487        };
8488        let code = encoder.encode(&op).unwrap();
8489        assert_eq!(
8490            code.len(),
8491            2,
8492            "Thumb-2 STRB with small offset should be 16-bit"
8493        );
8494    }
8495
8496    #[test]
8497    fn test_encode_ldrh_thumb2_16bit() {
8498        let encoder = ArmEncoder::new_thumb2();
8499        let op = ArmOp::Ldrh {
8500            rd: Reg::R0,
8501            addr: MemAddr::imm(Reg::R1, 4), // offset aligned to 2, <= 62
8502        };
8503        let code = encoder.encode(&op).unwrap();
8504        assert_eq!(
8505            code.len(),
8506            2,
8507            "Thumb-2 LDRH with small aligned offset should be 16-bit"
8508        );
8509    }
8510
8511    #[test]
8512    fn test_encode_strh_thumb2_16bit() {
8513        let encoder = ArmEncoder::new_thumb2();
8514        let op = ArmOp::Strh {
8515            rd: Reg::R0,
8516            addr: MemAddr::imm(Reg::R1, 4),
8517        };
8518        let code = encoder.encode(&op).unwrap();
8519        assert_eq!(
8520            code.len(),
8521            2,
8522            "Thumb-2 STRH with small aligned offset should be 16-bit"
8523        );
8524    }
8525
8526    #[test]
8527    fn test_encode_ldrsb_thumb2() {
8528        let encoder = ArmEncoder::new_thumb2();
8529        let op = ArmOp::Ldrsb {
8530            rd: Reg::R0,
8531            addr: MemAddr::imm(Reg::R1, 0),
8532        };
8533        let code = encoder.encode(&op).unwrap();
8534        // LDRSB has no 16-bit immediate form, always 32-bit
8535        assert_eq!(code.len(), 4, "Thumb-2 LDRSB should be 32-bit");
8536    }
8537
8538    #[test]
8539    fn test_encode_ldrsh_thumb2() {
8540        let encoder = ArmEncoder::new_thumb2();
8541        let op = ArmOp::Ldrsh {
8542            rd: Reg::R0,
8543            addr: MemAddr::imm(Reg::R1, 0),
8544        };
8545        let code = encoder.encode(&op).unwrap();
8546        assert_eq!(code.len(), 4, "Thumb-2 LDRSH should be 32-bit");
8547    }
8548
8549    #[test]
8550    fn test_encode_memory_size_thumb2() {
8551        let encoder = ArmEncoder::new_thumb2();
8552        let op = ArmOp::MemorySize { rd: Reg::R0 };
8553        let code = encoder.encode(&op).unwrap();
8554        // R0 and R10 are not both low registers, so this needs careful handling
8555        assert!(!code.is_empty(), "MemorySize should produce code");
8556    }
8557
8558    #[test]
8559    fn test_encode_memory_grow_thumb2() {
8560        let encoder = ArmEncoder::new_thumb2();
8561        let op = ArmOp::MemoryGrow {
8562            rd: Reg::R0,
8563            rn: Reg::R0,
8564        };
8565        let code = encoder.encode(&op).unwrap();
8566        assert_eq!(code.len(), 4, "MemoryGrow (MVN) should be 32-bit Thumb-2");
8567    }
8568
8569    #[test]
8570    fn test_encode_subword_reg_offset_thumb2() {
8571        let encoder = ArmEncoder::new_thumb2();
8572
8573        // LDRB with register offset
8574        let op = ArmOp::Ldrb {
8575            rd: Reg::R0,
8576            addr: MemAddr::reg(Reg::R1, Reg::R2),
8577        };
8578        let code = encoder.encode(&op).unwrap();
8579        assert_eq!(
8580            code.len(),
8581            4,
8582            "Thumb-2 LDRB with reg offset should be 32-bit"
8583        );
8584
8585        // STRB with register offset
8586        let op = ArmOp::Strb {
8587            rd: Reg::R0,
8588            addr: MemAddr::reg(Reg::R1, Reg::R2),
8589        };
8590        let code = encoder.encode(&op).unwrap();
8591        assert_eq!(
8592            code.len(),
8593            4,
8594            "Thumb-2 STRB with reg offset should be 32-bit"
8595        );
8596
8597        // LDRH with register offset
8598        let op = ArmOp::Ldrh {
8599            rd: Reg::R0,
8600            addr: MemAddr::reg(Reg::R1, Reg::R2),
8601        };
8602        let code = encoder.encode(&op).unwrap();
8603        assert_eq!(
8604            code.len(),
8605            4,
8606            "Thumb-2 LDRH with reg offset should be 32-bit"
8607        );
8608
8609        // STRH with register offset
8610        let op = ArmOp::Strh {
8611            rd: Reg::R0,
8612            addr: MemAddr::reg(Reg::R1, Reg::R2),
8613        };
8614        let code = encoder.encode(&op).unwrap();
8615        assert_eq!(
8616            code.len(),
8617            4,
8618            "Thumb-2 STRH with reg offset should be 32-bit"
8619        );
8620    }
8621
8622    #[test]
8623    fn test_encode_subword_reg_imm_offset_thumb2() {
8624        let encoder = ArmEncoder::new_thumb2();
8625
8626        // LDRB with both register and immediate offset
8627        let op = ArmOp::Ldrb {
8628            rd: Reg::R0,
8629            addr: MemAddr::reg_imm(Reg::R1, Reg::R2, 4),
8630        };
8631        let code = encoder.encode(&op).unwrap();
8632        // ADD R12, R2, #4 (4 bytes) + LDRB R0, [R1, R12] (4 bytes) = 8 bytes
8633        assert_eq!(
8634            code.len(),
8635            8,
8636            "Thumb-2 LDRB with reg+imm offset should be 8 bytes"
8637        );
8638    }
8639
8640    // ========================================================================
8641    // Helium MVE encoding tests
8642    // ========================================================================
8643
8644    #[test]
8645    fn test_encode_mve_addi32_thumb2() {
8646        let encoder = ArmEncoder::new_thumb2();
8647        let op = ArmOp::MveAddI {
8648            qd: QReg::Q0,
8649            qn: QReg::Q1,
8650            qm: QReg::Q2,
8651            size: MveSize::S32,
8652        };
8653        let code = encoder.encode(&op).unwrap();
8654        assert_eq!(
8655            code.len(),
8656            4,
8657            "MVE VADD.I32 should be 4 bytes (Thumb-2 32-bit)"
8658        );
8659    }
8660
8661    #[test]
8662    fn test_encode_mve_subi16_thumb2() {
8663        let encoder = ArmEncoder::new_thumb2();
8664        let op = ArmOp::MveSubI {
8665            qd: QReg::Q0,
8666            qn: QReg::Q1,
8667            qm: QReg::Q2,
8668            size: MveSize::S16,
8669        };
8670        let code = encoder.encode(&op).unwrap();
8671        assert_eq!(code.len(), 4, "MVE VSUB.I16 should be 4 bytes");
8672    }
8673
8674    #[test]
8675    fn test_encode_mve_muli8_thumb2() {
8676        let encoder = ArmEncoder::new_thumb2();
8677        let op = ArmOp::MveMulI {
8678            qd: QReg::Q0,
8679            qn: QReg::Q1,
8680            qm: QReg::Q2,
8681            size: MveSize::S8,
8682        };
8683        let code = encoder.encode(&op).unwrap();
8684        assert_eq!(code.len(), 4, "MVE VMUL.I8 should be 4 bytes");
8685    }
8686
8687    #[test]
8688    fn test_encode_mve_bitwise_thumb2() {
8689        let encoder = ArmEncoder::new_thumb2();
8690
8691        let ops = vec![
8692            ArmOp::MveAnd {
8693                qd: QReg::Q0,
8694                qn: QReg::Q1,
8695                qm: QReg::Q2,
8696            },
8697            ArmOp::MveOrr {
8698                qd: QReg::Q0,
8699                qn: QReg::Q1,
8700                qm: QReg::Q2,
8701            },
8702            ArmOp::MveEor {
8703                qd: QReg::Q0,
8704                qn: QReg::Q1,
8705                qm: QReg::Q2,
8706            },
8707            ArmOp::MveBic {
8708                qd: QReg::Q0,
8709                qn: QReg::Q1,
8710                qm: QReg::Q2,
8711            },
8712        ];
8713        for op in ops {
8714            let code = encoder.encode(&op).unwrap();
8715            assert_eq!(code.len(), 4, "MVE bitwise op should be 4 bytes");
8716        }
8717    }
8718
8719    #[test]
8720    fn test_encode_mve_mvn_thumb2() {
8721        let encoder = ArmEncoder::new_thumb2();
8722        let op = ArmOp::MveMvn {
8723            qd: QReg::Q0,
8724            qm: QReg::Q1,
8725        };
8726        let code = encoder.encode(&op).unwrap();
8727        assert_eq!(code.len(), 4, "MVE VMVN should be 4 bytes");
8728    }
8729
8730    #[test]
8731    fn test_encode_mve_load_store_thumb2() {
8732        let encoder = ArmEncoder::new_thumb2();
8733
8734        let load = ArmOp::MveLoad {
8735            qd: QReg::Q0,
8736            addr: MemAddr::imm(Reg::R0, 16),
8737        };
8738        let code = encoder.encode(&load).unwrap();
8739        assert_eq!(code.len(), 4, "MVE VLDRW.32 should be 4 bytes");
8740
8741        let store = ArmOp::MveStore {
8742            qd: QReg::Q1,
8743            addr: MemAddr::imm(Reg::R1, 0),
8744        };
8745        let code = encoder.encode(&store).unwrap();
8746        assert_eq!(code.len(), 4, "MVE VSTRW.32 should be 4 bytes");
8747    }
8748
8749    #[test]
8750    fn test_encode_mve_const_thumb2() {
8751        let encoder = ArmEncoder::new_thumb2();
8752        let op = ArmOp::MveConst {
8753            qd: QReg::Q0,
8754            bytes: [1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0],
8755        };
8756        let code = encoder.encode(&op).unwrap();
8757        // Should be 4 words of (MOVW R12 + VMOV Sn) = 4 * (4+4) = 32 bytes min
8758        // Some words with hi16=0 skip MOVT, so length varies
8759        assert!(
8760            code.len() >= 24,
8761            "MVE const should produce multiple instructions"
8762        );
8763    }
8764
8765    #[test]
8766    fn test_encode_mve_dup_thumb2() {
8767        let encoder = ArmEncoder::new_thumb2();
8768        let op = ArmOp::MveDup {
8769            qd: QReg::Q0,
8770            rn: Reg::R0,
8771            size: MveSize::S32,
8772        };
8773        let code = encoder.encode(&op).unwrap();
8774        assert_eq!(code.len(), 4, "MVE VDUP.32 should be 4 bytes");
8775    }
8776
8777    #[test]
8778    fn test_encode_mve_extract_lane_thumb2() {
8779        let encoder = ArmEncoder::new_thumb2();
8780        let op = ArmOp::MveExtractLane {
8781            rd: Reg::R0,
8782            qn: QReg::Q1,
8783            lane: 2,
8784            size: MveSize::S32,
8785        };
8786        let code = encoder.encode(&op).unwrap();
8787        assert_eq!(code.len(), 4, "MVE extract lane should be 4 bytes");
8788    }
8789
8790    #[test]
8791    fn test_encode_mve_insert_lane_thumb2() {
8792        let encoder = ArmEncoder::new_thumb2();
8793        let op = ArmOp::MveInsertLane {
8794            qd: QReg::Q0,
8795            rn: Reg::R1,
8796            lane: 3,
8797            size: MveSize::S32,
8798        };
8799        let code = encoder.encode(&op).unwrap();
8800        assert_eq!(code.len(), 4, "MVE insert lane should be 4 bytes");
8801    }
8802
8803    #[test]
8804    fn test_encode_mve_addf32_thumb2() {
8805        let encoder = ArmEncoder::new_thumb2();
8806        let op = ArmOp::MveAddF32 {
8807            qd: QReg::Q0,
8808            qn: QReg::Q1,
8809            qm: QReg::Q2,
8810        };
8811        let code = encoder.encode(&op).unwrap();
8812        assert_eq!(code.len(), 4, "MVE VADD.F32 should be 4 bytes");
8813    }
8814
8815    #[test]
8816    fn test_encode_mve_divf32_thumb2() {
8817        let encoder = ArmEncoder::new_thumb2();
8818        let op = ArmOp::MveDivF32 {
8819            qd: QReg::Q0,
8820            qn: QReg::Q1,
8821            qm: QReg::Q2,
8822        };
8823        let code = encoder.encode(&op).unwrap();
8824        // Lane-wise: 4 x VDIV.F32 = 4 x 4 = 16 bytes
8825        assert_eq!(
8826            code.len(),
8827            16,
8828            "MVE VDIV.F32 (lane-wise) should be 16 bytes"
8829        );
8830    }
8831
8832    #[test]
8833    fn test_encode_mve_sqrtf32_thumb2() {
8834        let encoder = ArmEncoder::new_thumb2();
8835        let op = ArmOp::MveSqrtF32 {
8836            qd: QReg::Q0,
8837            qm: QReg::Q1,
8838        };
8839        let code = encoder.encode(&op).unwrap();
8840        // Lane-wise: 4 x VSQRT.F32 = 4 x 4 = 16 bytes
8841        assert_eq!(
8842            code.len(),
8843            16,
8844            "MVE VSQRT.F32 (lane-wise) should be 16 bytes"
8845        );
8846    }
8847
8848    #[test]
8849    fn test_encode_mve_negf32_thumb2() {
8850        let encoder = ArmEncoder::new_thumb2();
8851        let op = ArmOp::MveNegF32 {
8852            qd: QReg::Q0,
8853            qm: QReg::Q1,
8854        };
8855        let code = encoder.encode(&op).unwrap();
8856        assert_eq!(code.len(), 4, "MVE VNEG.F32 should be 4 bytes");
8857    }
8858
8859    #[test]
8860    fn test_encode_mve_absf32_thumb2() {
8861        let encoder = ArmEncoder::new_thumb2();
8862        let op = ArmOp::MveAbsF32 {
8863            qd: QReg::Q0,
8864            qm: QReg::Q1,
8865        };
8866        let code = encoder.encode(&op).unwrap();
8867        assert_eq!(code.len(), 4, "MVE VABS.F32 should be 4 bytes");
8868    }
8869
8870    #[test]
8871    fn test_encode_mve_different_qregs() {
8872        let encoder = ArmEncoder::new_thumb2();
8873
8874        // Test that different Q-register numbers produce different encodings
8875        let op1 = ArmOp::MveAddI {
8876            qd: QReg::Q0,
8877            qn: QReg::Q0,
8878            qm: QReg::Q0,
8879            size: MveSize::S32,
8880        };
8881        let op2 = ArmOp::MveAddI {
8882            qd: QReg::Q3,
8883            qn: QReg::Q5,
8884            qm: QReg::Q7,
8885            size: MveSize::S32,
8886        };
8887        let code1 = encoder.encode(&op1).unwrap();
8888        let code2 = encoder.encode(&op2).unwrap();
8889        assert_ne!(
8890            code1, code2,
8891            "Different Q-registers should produce different encodings"
8892        );
8893    }
8894
8895    #[test]
8896    fn test_encode_mve_arm32_nop() {
8897        // MVE instructions on ARM32 encoder should produce NOP (only Thumb-2 supported)
8898        let encoder = ArmEncoder::new_arm32();
8899        let op = ArmOp::MveAddI {
8900            qd: QReg::Q0,
8901            qn: QReg::Q1,
8902            qm: QReg::Q2,
8903            size: MveSize::S32,
8904        };
8905        let code = encoder.encode(&op).unwrap();
8906        assert_eq!(code.len(), 4, "ARM32 MVE should be 4 bytes (NOP)");
8907        // NOP in ARM32 is 0xE1A00000 (MOV R0, R0)
8908        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8909        assert_eq!(instr, 0xE1A00000, "ARM32 MVE should encode as NOP");
8910    }
8911}