Skip to main content

synth_backend/
arm_encoder.rs

1//! ARM Code Encoder - Converts ARM instructions to binary machine code
2//!
3//! Generates ARM32/Thumb-2 machine code from ARM instruction structures
4
5use synth_core::Result;
6use synth_core::target::FPUPrecision;
7use synth_synthesis::contracts::encoding as encoding_contracts;
8use synth_synthesis::{ArmOp, MemAddr, MveSize, Operand2, QReg, Reg, VfpReg};
9
/// Encoder that turns [`ArmOp`] instructions into machine-code bytes.
///
/// The instruction set is fixed when the encoder is constructed:
/// [`ArmEncoder::new_arm32`] selects the ARM32 path, while
/// [`ArmEncoder::new_thumb2`] and [`ArmEncoder::new_thumb2_with_fpu`] select
/// the Thumb-2 path. [`ArmEncoder::encode`] dispatches on that choice.
pub struct ArmEncoder {
    /// Use Thumb mode (vs ARM mode): `true` routes `encode` to the Thumb
    /// encoder, `false` to the ARM32 encoder.
    thumb_mode: bool,
    /// FPU capability for VFP instruction encoding; `None` unless one was
    /// supplied through `new_thumb2_with_fpu`.
    // NOTE(review): the allow suggests this field is stored but not read yet —
    // confirm against the rest of the file before removing the attribute.
    #[allow(dead_code)]
    fpu: Option<FPUPrecision>,
}
18
19impl ArmEncoder {
20    /// Create a new ARM encoder in ARM32 mode
21    pub fn new_arm32() -> Self {
22        Self {
23            thumb_mode: false,
24            fpu: None,
25        }
26    }
27
28    /// Create a new ARM encoder in Thumb-2 mode
29    pub fn new_thumb2() -> Self {
30        Self {
31            thumb_mode: true,
32            fpu: None,
33        }
34    }
35
36    /// Create a new Thumb-2 encoder with FPU capability
37    pub fn new_thumb2_with_fpu(fpu: Option<FPUPrecision>) -> Self {
38        Self {
39            thumb_mode: true,
40            fpu,
41        }
42    }
43
44    /// Encode a single ARM instruction to bytes
45    pub fn encode(&self, op: &ArmOp) -> Result<Vec<u8>> {
46        if self.thumb_mode {
47            self.encode_thumb(op)
48        } else {
49            self.encode_arm(op)
50        }
51    }
52
53    /// Encode an ARM instruction in ARM32 mode (32-bit instructions)
54    /// #206: encode an ARM32 (A32) load/store whose address uses a register
55    /// offset (`[rn, rm{, #off}]`). Returns `None` for ops with no register
56    /// offset (the caller falls through to the immediate-form arms). Computes
57    /// `ip = base + rm` then re-encodes the op against `[ip, #off]`, which works
58    /// uniformly for word/byte/halfword/signed forms. IP (R12) is the scratch
59    /// register the selector already treats as clobberable across memory ops.
60    fn encode_arm_reg_offset_mem(&self, op: &ArmOp) -> Result<Option<Vec<u8>>> {
61        use synth_synthesis::Reg;
62        let addr = match op {
63            ArmOp::Ldr { addr, .. }
64            | ArmOp::Str { addr, .. }
65            | ArmOp::Ldrb { addr, .. }
66            | ArmOp::Strb { addr, .. }
67            | ArmOp::Ldrh { addr, .. }
68            | ArmOp::Strh { addr, .. }
69            | ArmOp::Ldrsb { addr, .. }
70            | ArmOp::Ldrsh { addr, .. } => addr,
71            _ => return Ok(None),
72        };
73        let Some(rm) = addr.offset_reg else {
74            return Ok(None);
75        };
76        let ip = Reg::R12;
77        // ADD ip, base, rm  (cond=AL, opcode=ADD, S=0, register operand2)
78        let add: u32 = 0xE0800000
79            | (reg_to_bits(&addr.base) << 16)
80            | (reg_to_bits(&ip) << 12)
81            | reg_to_bits(&rm);
82        let mut bytes = add.to_le_bytes().to_vec();
83        // Re-encode the op against [ip, #off] (immediate form → no offset_reg,
84        // so this recursion hits the immediate arms, not this helper again).
85        let imm_addr = MemAddr::imm(ip, addr.offset);
86        let imm_op = match op {
87            ArmOp::Ldr { rd, .. } => ArmOp::Ldr {
88                rd: *rd,
89                addr: imm_addr,
90            },
91            ArmOp::Str { rd, .. } => ArmOp::Str {
92                rd: *rd,
93                addr: imm_addr,
94            },
95            ArmOp::Ldrb { rd, .. } => ArmOp::Ldrb {
96                rd: *rd,
97                addr: imm_addr,
98            },
99            ArmOp::Strb { rd, .. } => ArmOp::Strb {
100                rd: *rd,
101                addr: imm_addr,
102            },
103            ArmOp::Ldrh { rd, .. } => ArmOp::Ldrh {
104                rd: *rd,
105                addr: imm_addr,
106            },
107            ArmOp::Strh { rd, .. } => ArmOp::Strh {
108                rd: *rd,
109                addr: imm_addr,
110            },
111            ArmOp::Ldrsb { rd, .. } => ArmOp::Ldrsb {
112                rd: *rd,
113                addr: imm_addr,
114            },
115            ArmOp::Ldrsh { rd, .. } => ArmOp::Ldrsh {
116                rd: *rd,
117                addr: imm_addr,
118            },
119            _ => unreachable!(),
120        };
121        bytes.extend(self.encode_arm(&imm_op)?);
122        Ok(Some(bytes))
123    }
124
125    fn encode_arm(&self, op: &ArmOp) -> Result<Vec<u8>> {
126        // #206: ARM32 register-offset loads/stores. `encode_mem_addr` only
127        // returns the 12-bit immediate, so the immediate-form arms below
128        // silently DROP `addr.offset_reg` — a runtime address index vanished,
129        // turning `ldr rd,[rn,rm,#off]` into `ldr rd,[rn,#off]` (the access went
130        // to the wrong address). Compute the effective base into IP and re-encode
131        // against `[ip, #off]`, which is uniform for word/byte/halfword/signed.
132        if let Some(bytes) = self.encode_arm_reg_offset_mem(op)? {
133            return Ok(bytes);
134        }
135        let instr: u32 = match op {
136            // Data processing instructions
137            ArmOp::Add { rd, rn, op2 } => {
138                let rd_bits = reg_to_bits(rd);
139                let rn_bits = reg_to_bits(rn);
140                let (op2_bits, i_flag) = encode_operand2(op2);
141
142                // ADD encoding: cond(4) | 00 | I(1) | 0100 | S(1) | Rn(4) | Rd(4) | operand2(12)
143                0xE0800000 // condition=always(E), opcode=ADD(0100), S=0
144                    | (i_flag << 25)
145                    | (rn_bits << 16)
146                    | (rd_bits << 12)
147                    | op2_bits
148            }
149
150            ArmOp::Sub { rd, rn, op2 } => {
151                let rd_bits = reg_to_bits(rd);
152                let rn_bits = reg_to_bits(rn);
153                let (op2_bits, i_flag) = encode_operand2(op2);
154
155                // SUB encoding: opcode=0010
156                0xE0400000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
157            }
158
159            // i64 support: ADDS, ADC, SUBS, SBC for ARM32
160            ArmOp::Adds { rd, rn, op2 } => {
161                let rd_bits = reg_to_bits(rd);
162                let rn_bits = reg_to_bits(rn);
163                let (op2_bits, i_flag) = encode_operand2(op2);
164
165                // ADDS encoding: opcode=0100, S=1
166                0xE0900000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
167            }
168
169            ArmOp::Adc { rd, rn, op2 } => {
170                let rd_bits = reg_to_bits(rd);
171                let rn_bits = reg_to_bits(rn);
172                let (op2_bits, i_flag) = encode_operand2(op2);
173
174                // ADC encoding: opcode=0101
175                0xE0A00000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
176            }
177
178            ArmOp::Subs { rd, rn, op2 } => {
179                let rd_bits = reg_to_bits(rd);
180                let rn_bits = reg_to_bits(rn);
181                let (op2_bits, i_flag) = encode_operand2(op2);
182
183                // SUBS encoding: opcode=0010, S=1
184                0xE0500000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
185            }
186
187            ArmOp::Sbc { rd, rn, op2 } => {
188                let rd_bits = reg_to_bits(rd);
189                let rn_bits = reg_to_bits(rn);
190                let (op2_bits, i_flag) = encode_operand2(op2);
191
192                // SBC encoding: opcode=0110
193                0xE0C00000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
194            }
195
196            ArmOp::Mul { rd, rn, rm } => {
197                let rd_bits = reg_to_bits(rd);
198                let rn_bits = reg_to_bits(rn);
199                let rm_bits = reg_to_bits(rm);
200
201                // MUL encoding: cond(4) | 000000 | A(1) | S(1) | Rd(4) | Rn(4) | Rs(4) | 1001 | Rm(4)
202                0xE0000090 | (rd_bits << 16) | (rn_bits << 8) | rm_bits
203            }
204
205            ArmOp::Umull { rdlo, rdhi, rn, rm } => {
206                let rdlo_bits = reg_to_bits(rdlo);
207                let rdhi_bits = reg_to_bits(rdhi);
208                let rn_bits = reg_to_bits(rn);
209                let rm_bits = reg_to_bits(rm);
210
211                // UMULL encoding: cond(4) | 0000 1000 | RdHi(4) | RdLo(4) | Rm(4) | 1001 | Rn(4)
212                0xE0800090 | (rdhi_bits << 16) | (rdlo_bits << 12) | (rm_bits << 8) | rn_bits
213            }
214
215            ArmOp::Sdiv { rd, rn, rm } => {
216                let rd_bits = reg_to_bits(rd);
217                let rn_bits = reg_to_bits(rn);
218                let rm_bits = reg_to_bits(rm);
219
220                // SDIV encoding: cond(4) | 01110001 | Rd(4) | 1111 | Rm(4) | 0001 | Rn(4)
221                // ARMv7-M and above
222                0xE710F010 | (rd_bits << 16) | (rm_bits << 8) | rn_bits
223            }
224
225            ArmOp::Udiv { rd, rn, rm } => {
226                let rd_bits = reg_to_bits(rd);
227                let rn_bits = reg_to_bits(rn);
228                let rm_bits = reg_to_bits(rm);
229
230                // UDIV encoding: cond(4) | 01110011 | Rd(4) | 1111 | Rm(4) | 0001 | Rn(4)
231                // ARMv7-M and above
232                0xE730F010 | (rd_bits << 16) | (rm_bits << 8) | rn_bits
233            }
234
235            ArmOp::Mls { rd, rn, rm, ra } => {
236                let rd_bits = reg_to_bits(rd);
237                let rn_bits = reg_to_bits(rn);
238                let rm_bits = reg_to_bits(rm);
239                let ra_bits = reg_to_bits(ra);
240
241                // MLS encoding: cond(4) | 00000110 | Rd(4) | Ra(4) | Rm(4) | 1001 | Rn(4)
242                // Rd = Ra - (Rn * Rm)
243                0xE0600090 | (rd_bits << 16) | (ra_bits << 12) | (rm_bits << 8) | rn_bits
244            }
245
246            ArmOp::And { rd, rn, op2 } => {
247                let rd_bits = reg_to_bits(rd);
248                let rn_bits = reg_to_bits(rn);
249                let (op2_bits, i_flag) = encode_operand2(op2);
250
251                // AND encoding: opcode=0000
252                0xE0000000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
253            }
254
255            ArmOp::Orr { rd, rn, op2 } => {
256                let rd_bits = reg_to_bits(rd);
257                let rn_bits = reg_to_bits(rn);
258                let (op2_bits, i_flag) = encode_operand2(op2);
259
260                // ORR encoding: opcode=1100
261                0xE1800000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
262            }
263
264            ArmOp::Eor { rd, rn, op2 } => {
265                let rd_bits = reg_to_bits(rd);
266                let rn_bits = reg_to_bits(rn);
267                let (op2_bits, i_flag) = encode_operand2(op2);
268
269                // EOR encoding: opcode=0001
270                0xE0200000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
271            }
272
273            // Shift instructions
274            ArmOp::Lsl { rd, rn, shift } => {
275                let rd_bits = reg_to_bits(rd);
276                let rn_bits = reg_to_bits(rn);
277                let shift_bits = *shift & 0x1F;
278
279                // LSL encoding: MOV with shift
280                0xE1A00000 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
281            }
282
283            ArmOp::Lsr { rd, rn, shift } => {
284                let rd_bits = reg_to_bits(rd);
285                let rn_bits = reg_to_bits(rn);
286                let shift_bits = *shift & 0x1F;
287
288                // LSR encoding
289                0xE1A00020 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
290            }
291
292            ArmOp::Asr { rd, rn, shift } => {
293                let rd_bits = reg_to_bits(rd);
294                let rn_bits = reg_to_bits(rn);
295                let shift_bits = *shift & 0x1F;
296
297                // ASR encoding
298                0xE1A00040 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
299            }
300
301            ArmOp::Ror { rd, rn, shift } => {
302                let rd_bits = reg_to_bits(rd);
303                let rn_bits = reg_to_bits(rn);
304                let shift_bits = *shift & 0x1F;
305
306                // ROR encoding: MOV with ROR shift
307                0xE1A00060 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
308            }
309
310            // Register-based shifts (ARM32)
311            // LSL Rd, Rn, Rm: cond 0001101S 0000 Rd Rs 0001 Rn
312            ArmOp::LslReg { rd, rn, rm } => {
313                let rd_bits = reg_to_bits(rd);
314                let rn_bits = reg_to_bits(rn);
315                let rm_bits = reg_to_bits(rm);
316                0xE1A00010 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
317            }
318            ArmOp::LsrReg { rd, rn, rm } => {
319                let rd_bits = reg_to_bits(rd);
320                let rn_bits = reg_to_bits(rn);
321                let rm_bits = reg_to_bits(rm);
322                0xE1A00030 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
323            }
324            ArmOp::AsrReg { rd, rn, rm } => {
325                let rd_bits = reg_to_bits(rd);
326                let rn_bits = reg_to_bits(rn);
327                let rm_bits = reg_to_bits(rm);
328                0xE1A00050 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
329            }
330            ArmOp::RorReg { rd, rn, rm } => {
331                let rd_bits = reg_to_bits(rd);
332                let rn_bits = reg_to_bits(rn);
333                let rm_bits = reg_to_bits(rm);
334                0xE1A00070 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
335            }
336
337            // RSB (Reverse Subtract): Rd = imm - Rn
338            ArmOp::Rsb { rd, rn, imm } => {
339                let rd_bits = reg_to_bits(rd);
340                let rn_bits = reg_to_bits(rn);
341                // RSB encoding: cond(4) | 00 1 0011 S | Rn(4) | Rd(4) | imm12
342                // Opcode for RSB = 0011, I=1 (immediate), S=0
343                0xE2600000 | (rn_bits << 16) | (rd_bits << 12) | (*imm & 0xFF)
344            }
345
346            // Bit manipulation instructions
347            ArmOp::Clz { rd, rm } => {
348                let rd_bits = reg_to_bits(rd);
349                let rm_bits = reg_to_bits(rm);
350
351                // CLZ encoding: cond(4) | 00010110 | 1111 | Rd(4) | 1111 | 0001 | Rm(4)
352                // ARMv5T and above
353                0xE16F0F10 | (rd_bits << 12) | rm_bits
354            }
355
356            ArmOp::Rbit { rd, rm } => {
357                let rd_bits = reg_to_bits(rd);
358                let rm_bits = reg_to_bits(rm);
359
360                // RBIT encoding: cond(4) | 01101111 | 1111 | Rd(4) | 1111 | 0011 | Rm(4)
361                // ARMv6T2 and above
362                0xE6FF0F30 | (rd_bits << 12) | rm_bits
363            }
364
365            ArmOp::Sxtb { rd, rm } => {
366                let rd_bits = reg_to_bits(rd);
367                let rm_bits = reg_to_bits(rm);
368
369                // SXTB encoding: cond(4) | 01101010 | 1111 | Rd(4) | rotate(2) | 00 | 0111 | Rm(4)
370                // ARMv6 and above. rotate=00 for no rotation
371                0xE6AF0070 | (rd_bits << 12) | rm_bits
372            }
373
374            ArmOp::Sxth { rd, rm } => {
375                let rd_bits = reg_to_bits(rd);
376                let rm_bits = reg_to_bits(rm);
377
378                // SXTH encoding: cond(4) | 01101011 | 1111 | Rd(4) | rotate(2) | 00 | 0111 | Rm(4)
379                // ARMv6 and above. rotate=00 for no rotation
380                0xE6BF0070 | (rd_bits << 12) | rm_bits
381            }
382
383            // Move instructions
384            ArmOp::Mov { rd, op2 } => {
385                let rd_bits = reg_to_bits(rd);
386                let (op2_bits, i_flag) = encode_operand2(op2);
387
388                // MOV encoding: opcode=1101
389                0xE1A00000 | (i_flag << 25) | (rd_bits << 12) | op2_bits
390            }
391
392            ArmOp::Mvn { rd, op2 } => {
393                let rd_bits = reg_to_bits(rd);
394                let (op2_bits, i_flag) = encode_operand2(op2);
395
396                // MVN encoding: opcode=1111
397                0xE1E00000 | (i_flag << 25) | (rd_bits << 12) | op2_bits
398            }
399
400            // MOVW - Move Wide (ARM32)
401            // Encoding: cond(4) | 0011 0000 | imm4(4) | Rd(4) | imm12(12)
402            ArmOp::Movw { rd, imm16 } => {
403                let rd_bits = reg_to_bits(rd);
404                let imm4 = ((*imm16 as u32) >> 12) & 0xF;
405                let imm12 = (*imm16 as u32) & 0xFFF;
406                0xE3000000 | (imm4 << 16) | (rd_bits << 12) | imm12
407            }
408
409            // MOVT - Move Top (ARM32)
410            // Encoding: cond(4) | 0011 0100 | imm4(4) | Rd(4) | imm12(12)
411            ArmOp::Movt { rd, imm16 } => {
412                let rd_bits = reg_to_bits(rd);
413                let imm4 = ((*imm16 as u32) >> 12) & 0xF;
414                let imm12 = (*imm16 as u32) & 0xFFF;
415                0xE3400000 | (imm4 << 16) | (rd_bits << 12) | imm12
416            }
417
418            // Compare
419            ArmOp::Cmp { rn, op2 } => {
420                let rn_bits = reg_to_bits(rn);
421                let (op2_bits, i_flag) = encode_operand2(op2);
422
423                // CMP encoding: opcode=1010, S=1
424                0xE1500000 | (i_flag << 25) | (rn_bits << 16) | op2_bits
425            }
426
427            // Compare Negative (CMN) - computes Rn + op2 and sets flags
428            ArmOp::Cmn { rn, op2 } => {
429                let rn_bits = reg_to_bits(rn);
430                let (op2_bits, i_flag) = encode_operand2(op2);
431
432                // CMN encoding: opcode=1011, S=1
433                0xE1700000 | (i_flag << 25) | (rn_bits << 16) | op2_bits
434            }
435
436            // Load/Store
437            ArmOp::Ldr { rd, addr } => {
438                let rd_bits = reg_to_bits(rd);
439                let (base_bits, offset_bits) = encode_mem_addr(addr);
440
441                // LDR encoding: cond(4) | 01 | I(1) | P(1) | U(1) | B(1) | W(1) | L(1) | Rn(4) | Rd(4) | offset(12)
442                // P=1 (pre-indexed), U=1 (add offset), L=1 (load)
443                0xE5900000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
444            }
445
446            ArmOp::Str { rd, addr } => {
447                let rd_bits = reg_to_bits(rd);
448                let (base_bits, offset_bits) = encode_mem_addr(addr);
449
450                // STR encoding: L=0 (store)
451                0xE5800000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
452            }
453
454            // Sub-word loads (ARM32 encoding)
455            ArmOp::Ldrb { rd, addr } => {
456                let rd_bits = reg_to_bits(rd);
457                let (base_bits, offset_bits) = encode_mem_addr(addr);
458                // LDRB: LDR with B=1 (byte): cond|01|I|P|U|1|W|L|Rn|Rd|offset
459                0xE5D00000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
460            }
461
462            ArmOp::Ldrsb { rd, addr } => {
463                let rd_bits = reg_to_bits(rd);
464                let (base_bits, offset_bits) = encode_mem_addr(addr);
465                // LDRSB (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1101|imm4L
466                // Simplified with immediate offset
467                let offset_val = offset_bits & 0xFF;
468                let imm4h = (offset_val >> 4) & 0xF;
469                let imm4l = offset_val & 0xF;
470                0xE1D000D0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
471            }
472
473            ArmOp::Ldrh { rd, addr } => {
474                let rd_bits = reg_to_bits(rd);
475                let (base_bits, offset_bits) = encode_mem_addr(addr);
476                // LDRH (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1011|imm4L
477                let offset_val = offset_bits & 0xFF;
478                let imm4h = (offset_val >> 4) & 0xF;
479                let imm4l = offset_val & 0xF;
480                0xE1D000B0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
481            }
482
483            ArmOp::Ldrsh { rd, addr } => {
484                let rd_bits = reg_to_bits(rd);
485                let (base_bits, offset_bits) = encode_mem_addr(addr);
486                // LDRSH (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1111|imm4L
487                let offset_val = offset_bits & 0xFF;
488                let imm4h = (offset_val >> 4) & 0xF;
489                let imm4l = offset_val & 0xF;
490                0xE1D000F0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
491            }
492
493            // Sub-word stores (ARM32 encoding)
494            ArmOp::Strb { rd, addr } => {
495                let rd_bits = reg_to_bits(rd);
496                let (base_bits, offset_bits) = encode_mem_addr(addr);
497                // STRB: STR with B=1 (byte): cond|01|I|P|U|1|W|0|Rn|Rd|offset
498                0xE5C00000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
499            }
500
501            ArmOp::Strh { rd, addr } => {
502                let rd_bits = reg_to_bits(rd);
503                let (base_bits, offset_bits) = encode_mem_addr(addr);
504                // STRH (misc store): cond|000|P|U|1|W|0|Rn|Rd|imm4H|1011|imm4L
505                let offset_val = offset_bits & 0xFF;
506                let imm4h = (offset_val >> 4) & 0xF;
507                let imm4l = offset_val & 0xF;
508                0xE1C000B0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
509            }
510
511            // Memory management (ARM32 encoding)
512            ArmOp::MemorySize { rd } => {
513                let rd_bits = reg_to_bits(rd);
514                // MOV rd, R10, LSR #16  (memory size in bytes / 65536 = pages)
515                // cond|000|1101|S|0000|Rd|shift5|type|0|Rm
516                // LSR #16: shift5=10000, type=01
517                0xE1A00820 | (rd_bits << 12) | 0x0A // Rm=R10, shift=16, LSR
518            }
519
520            ArmOp::MemoryGrow { rd, .. } => {
521                let rd_bits = reg_to_bits(rd);
522                // On embedded, always fail: MOV rd, #-1
523                0xE3E00000 | (rd_bits << 12) // MVN rd, #0 = MOV rd, #-1
524            }
525
526            // Label pseudo-instruction: emits no machine code
527            ArmOp::Label { .. } => {
528                return Ok(Vec::new());
529            }
530
531            // Branch instructions
532            ArmOp::B { label: _ } => {
533                // B encoding: cond(4) | 1010 | offset(24)
534                // Simplified: branch to offset 0 (will be patched by linker/resolver)
535                0xEA000000
536            }
537
538            // Conditional branch to label (generic)
539            ArmOp::Bcc { cond, label: _ } => {
540                use synth_synthesis::Condition;
541                let cond_bits: u32 = match cond {
542                    Condition::EQ => 0x0,
543                    Condition::NE => 0x1,
544                    Condition::HS => 0x2,
545                    Condition::LO => 0x3,
546                    Condition::HI => 0x8,
547                    Condition::LS => 0x9,
548                    Condition::GE => 0xA,
549                    Condition::LT => 0xB,
550                    Condition::GT => 0xC,
551                    Condition::LE => 0xD,
552                };
553                // B<cond> with offset 0 (will be patched)
554                (cond_bits << 28) | 0x0A000000
555            }
556
557            // BHS (Branch if Higher or Same) - used for bounds checking
558            ArmOp::Bhs { label: _ } => {
559                // BHS encoding: cond(2=HS) | 1010 | offset(24)
560                0x2A000000 // BHS with offset 0
561            }
562
563            // BLO (Branch if Lower) - complementary to BHS
564            ArmOp::Blo { label: _ } => {
565                // BLO encoding: cond(3=LO) | 1010 | offset(24)
566                0x3A000000 // BLO with offset 0
567            }
568
569            // Branch with numeric offset (in instructions)
570            // ARM32 B instruction: offset is in instructions, stored as words
571            // The offset is relative to PC+8 (due to ARM pipeline)
572            ArmOp::BOffset { offset } => {
573                // B encoding: cond(4) | 1010 | offset(24)
574                // Offset is signed, in words (4-byte units)
575                // ARM adds PC+8 to the offset, so we need to adjust:
576                // target = PC + 8 + (offset * 4)
577                // For backward branch of N instructions: offset = -(N + 2)
578                // wrapping_sub keeps the encoder total under fuzzing (#186): an
579                // extreme i32::MIN offset would otherwise overflow-panic; for any
580                // real branch offset this is identical to `- 2`.
581                let adjusted_offset = offset.wrapping_sub(2); // Account for PC+8
582                let offset_bits = (adjusted_offset as u32) & 0x00FFFFFF;
583                0xEA000000 | offset_bits
584            }
585
586            // Conditional branch with numeric offset
587            ArmOp::BCondOffset { cond, offset } => {
588                use synth_synthesis::Condition;
589                let cond_bits: u32 = match cond {
590                    Condition::EQ => 0x0,
591                    Condition::NE => 0x1,
592                    Condition::HS => 0x2,
593                    Condition::LO => 0x3,
594                    Condition::HI => 0x8,
595                    Condition::LS => 0x9,
596                    Condition::GE => 0xA,
597                    Condition::LT => 0xB,
598                    Condition::GT => 0xC,
599                    Condition::LE => 0xD,
600                };
601                // B<cond> encoding: cond(4) | 1010 | offset(24)
602                // wrapping_sub: total under fuzzing (#186), identical for real offsets.
603                let adjusted_offset = offset.wrapping_sub(2); // Account for PC+8
604                let offset_bits = (adjusted_offset as u32) & 0x00FFFFFF;
605                (cond_bits << 28) | 0x0A000000 | offset_bits
606            }
607
608            ArmOp::Bl { label: _ } => {
609                // BL encoding: cond(4) | 1011 | offset(24)
610                0xEB000000
611            }
612
613            ArmOp::Bx { rm } => {
614                let rm_bits = reg_to_bits(rm);
615
616                // BX encoding: cond(4) | 000100101111111111110001 | Rm(4)
617                0xE12FFF10 | rm_bits
618            }
619
620            ArmOp::Blx { rm } => {
621                let rm_bits = reg_to_bits(rm);
622
623                // BLX (register) encoding: cond(4) | 000100101111111111110011 | Rm(4)
624                0xE12FFF30 | rm_bits
625            }
626
627            ArmOp::Push { regs } => {
628                // STMDB SP!, {regs} encoding: cond(4) | 100100 | 10 | 1101 | register_list(16)
629                let mut reg_list: u32 = 0;
630                for r in regs {
631                    reg_list |= 1 << reg_to_bits(r);
632                }
633                0xE92D0000 | reg_list
634            }
635
636            ArmOp::Pop { regs } => {
637                // LDMIA SP!, {regs} encoding: cond(4) | 100010 | 11 | 1101 | register_list(16)
638                let mut reg_list: u32 = 0;
639                for r in regs {
640                    reg_list |= 1 << reg_to_bits(r);
641                }
642                0xE8BD0000 | reg_list
643            }
644
645            ArmOp::Nop => {
646                // NOP encoding: MOV R0, R0
647                0xE1A00000
648            }
649
650            ArmOp::Udf { imm } => {
651                // UDF (Undefined) encoding in ARM: 0xE7F000F0 | (imm12_hi << 8) | imm4_lo
652                // We only use imm8, so split into imm4_hi and imm4_lo
653                let imm8 = *imm as u32;
654                0xE7F000F0 | ((imm8 & 0xF0) << 4) | (imm8 & 0x0F)
655            }
656
657            // Pseudo-instructions for verification - encode as NOP
658            // These are used in formal verification but not actual code generation
659            ArmOp::Popcnt { .. } => {
660                // Population count pseudo-instruction
661                // Not a real ARM instruction, would be expanded to actual code
662                0xE1A00000 // NOP for now
663            }
664
665            ArmOp::SetCond { .. } => {
666                // Condition evaluation pseudo-instruction
667                // Not a real ARM instruction, would be expanded to actual code
668                0xE1A00000 // NOP for now
669            }
670
671            ArmOp::SelectMove { .. } => {
672                // Conditional move pseudo-instruction for ARM32
673                // Would use MOV{cond} instruction
674                0xE1A00000 // NOP for now
675            }
676
677            ArmOp::Select { .. } => {
678                // Select pseudo-instruction
679                // Not a real ARM instruction, would be expanded to conditional moves
680                0xE1A00000 // NOP for now
681            }
682
683            ArmOp::LocalGet { .. } => {
684                // Local variable get pseudo-instruction
685                // Not a real ARM instruction, would be expanded to memory access
686                0xE1A00000 // NOP for now
687            }
688
689            ArmOp::LocalSet { .. } => {
690                // Local variable set pseudo-instruction
691                // Not a real ARM instruction, would be expanded to memory access
692                0xE1A00000 // NOP for now
693            }
694
695            ArmOp::LocalTee { .. } => {
696                // Local variable tee pseudo-instruction
697                // Not a real ARM instruction, would be expanded to memory access
698                0xE1A00000 // NOP for now
699            }
700
701            ArmOp::GlobalGet { .. } => {
702                // Global variable get pseudo-instruction
703                // Not a real ARM instruction, would be expanded to memory access
704                0xE1A00000 // NOP for now
705            }
706
707            ArmOp::GlobalSet { .. } => {
708                // Global variable set pseudo-instruction
709                // Not a real ARM instruction, would be expanded to memory access
710                0xE1A00000 // NOP for now
711            }
712
713            ArmOp::BrTable { .. } => {
714                // Branch table pseudo-instruction
715                // Not a real ARM instruction, would be expanded to jump table
716                0xE1A00000 // NOP for now
717            }
718
719            ArmOp::Call { .. } => {
720                // Function call pseudo-instruction
721                // Not a real ARM instruction, would be expanded to BL
722                0xE1A00000 // NOP for now
723            }
724
725            ArmOp::CallIndirect { .. } => {
726                // Indirect function call pseudo-instruction
727                // Not a real ARM instruction, would be expanded to indirect branch
728                0xE1A00000 // NOP for now
729            }
730
731            // i64 pseudo-instructions (Phase 2) - encode as NOP for now
732            // Real compiler would expand these to multi-instruction sequences
733            ArmOp::I64Add { .. } => 0xE1A00000,        // NOP
734            ArmOp::I64Sub { .. } => 0xE1A00000,        // NOP
735            ArmOp::I64DivS { .. } => 0xE1A00000,       // NOP
736            ArmOp::I64DivU { .. } => 0xE1A00000,       // NOP
737            ArmOp::I64RemS { .. } => 0xE1A00000,       // NOP
738            ArmOp::I64RemU { .. } => 0xE1A00000,       // NOP
739            ArmOp::I64Clz { .. } => 0xE1A00000,        // NOP
740            ArmOp::I64Ctz { .. } => 0xE1A00000,        // NOP
741            ArmOp::I64Popcnt { .. } => 0xE1A00000,     // NOP
742            ArmOp::I64And { .. } => 0xE1A00000,        // NOP
743            ArmOp::I64Or { .. } => 0xE1A00000,         // NOP
744            ArmOp::I64Xor { .. } => 0xE1A00000,        // NOP
745            ArmOp::I64Eqz { .. } => 0xE1A00000,        // NOP
746            ArmOp::I64Eq { .. } => 0xE1A00000,         // NOP
747            ArmOp::I64Ne { .. } => 0xE1A00000,         // NOP
748            ArmOp::I64LtS { .. } => 0xE1A00000,        // NOP
749            ArmOp::I64LtU { .. } => 0xE1A00000,        // NOP
750            ArmOp::I64LeS { .. } => 0xE1A00000,        // NOP
751            ArmOp::I64LeU { .. } => 0xE1A00000,        // NOP
752            ArmOp::I64GtS { .. } => 0xE1A00000,        // NOP
753            ArmOp::I64GtU { .. } => 0xE1A00000,        // NOP
754            ArmOp::I64GeS { .. } => 0xE1A00000,        // NOP
755            ArmOp::I64GeU { .. } => 0xE1A00000,        // NOP
756            ArmOp::I64Const { .. } => 0xE1A00000,      // NOP
757            ArmOp::I64Ldr { .. } => 0xE1A00000,        // NOP
758            ArmOp::I64Str { .. } => 0xE1A00000,        // NOP
759            ArmOp::I64ExtendI32S { .. } => 0xE1A00000, // NOP
760            ArmOp::I64ExtendI32U { .. } => 0xE1A00000, // NOP
761            ArmOp::I64Extend8S { .. } => 0xE1A00000,   // NOP (Thumb-2 only)
762            ArmOp::I64Extend16S { .. } => 0xE1A00000,  // NOP (Thumb-2 only)
763            ArmOp::I64Extend32S { .. } => 0xE1A00000,  // NOP (Thumb-2 only)
764            ArmOp::I32WrapI64 { .. } => 0xE1A00000,    // NOP
765
766            // f32 VFP single-precision instructions
767            ArmOp::F32Add { sd, sn, sm } => encode_vfp_3reg(0xEE300A00, sd, sn, sm)?,
768            ArmOp::F32Sub { sd, sn, sm } => encode_vfp_3reg(0xEE300A40, sd, sn, sm)?,
769            ArmOp::F32Mul { sd, sn, sm } => encode_vfp_3reg(0xEE200A00, sd, sn, sm)?,
770            ArmOp::F32Div { sd, sn, sm } => encode_vfp_3reg(0xEE800A00, sd, sn, sm)?,
771            ArmOp::F32Abs { sd, sm } => encode_vfp_2reg(0xEEB00AC0, sd, sm)?,
772            ArmOp::F32Neg { sd, sm } => encode_vfp_2reg(0xEEB10A40, sd, sm)?,
773            ArmOp::F32Sqrt { sd, sm } => encode_vfp_2reg(0xEEB10AC0, sd, sm)?,
774
775            // f32 pseudo-ops — multi-instruction sequences
776            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
777            ArmOp::F32Ceil { sd, sm } => {
778                return self.encode_arm_f32_rounding(sd, sm, 0b01); // Round toward +Inf
779            }
780            ArmOp::F32Floor { sd, sm } => {
781                return self.encode_arm_f32_rounding(sd, sm, 0b10); // Round toward -Inf
782            }
783            ArmOp::F32Trunc { sd, sm } => {
784                return self.encode_arm_f32_rounding(sd, sm, 0b11); // VCVT toward zero
785            }
786            ArmOp::F32Nearest { sd, sm } => {
787                return self.encode_arm_f32_rounding(sd, sm, 0b00); // VCVT to nearest
788            }
789            ArmOp::F32Min { sd, sn, sm } => {
790                return self.encode_arm_f32_minmax(sd, sn, sm, true);
791            }
792            ArmOp::F32Max { sd, sn, sm } => {
793                return self.encode_arm_f32_minmax(sd, sn, sm, false);
794            }
795            ArmOp::F32Copysign { sd, sn, sm } => {
796                return self.encode_arm_f32_copysign(sd, sn, sm);
797            }
798
799            // f32 comparisons — multi-instruction: VCMP + VMRS + conditional MOV
800            ArmOp::F32Eq { rd, sn, sm } => {
801                return self.encode_arm_f32_compare(rd, sn, sm, 0x0); // EQ
802            }
803            ArmOp::F32Ne { rd, sn, sm } => {
804                return self.encode_arm_f32_compare(rd, sn, sm, 0x1); // NE
805            }
806            ArmOp::F32Lt { rd, sn, sm } => {
807                return self.encode_arm_f32_compare(rd, sn, sm, 0x4); // MI (less than)
808            }
809            ArmOp::F32Le { rd, sn, sm } => {
810                return self.encode_arm_f32_compare(rd, sn, sm, 0x9); // LS (less or same)
811            }
812            ArmOp::F32Gt { rd, sn, sm } => {
813                return self.encode_arm_f32_compare(rd, sn, sm, 0xC); // GT
814            }
815            ArmOp::F32Ge { rd, sn, sm } => {
816                return self.encode_arm_f32_compare(rd, sn, sm, 0xA); // GE
817            }
818
819            // f32 const — multi-instruction: MOVW + MOVT + VMOV
820            ArmOp::F32Const { sd, value } => {
821                return self.encode_arm_f32_const(sd, *value);
822            }
823
824            ArmOp::F32Load { sd, addr } => encode_vfp_ldst(0xED900A00, sd, addr)?,
825            ArmOp::F32Store { sd, addr } => encode_vfp_ldst(0xED800A00, sd, addr)?,
826
827            // f32 conversions — multi-instruction sequences
828            ArmOp::F32ConvertI32S { sd, rm } => {
829                return self.encode_arm_f32_convert_i32(sd, rm, true);
830            }
831            ArmOp::F32ConvertI32U { sd, rm } => {
832                return self.encode_arm_f32_convert_i32(sd, rm, false);
833            }
834            ArmOp::F32ConvertI64S { .. } | ArmOp::F32ConvertI64U { .. } => {
835                return Err(synth_core::Error::synthesis(
836                    "F32 i64 conversion not supported (requires register pairs on 32-bit ARM)",
837                ));
838            }
839            ArmOp::F32ReinterpretI32 { sd, rm } => encode_vmov_core_sreg(true, sd, rm)?,
840            ArmOp::I32ReinterpretF32 { rd, sm } => encode_vmov_core_sreg(false, sm, rd)?,
841            ArmOp::I32TruncF32S { rd, sm } => {
842                return self.encode_arm_i32_trunc_f32(rd, sm, true);
843            }
844            ArmOp::I32TruncF32U { rd, sm } => {
845                return self.encode_arm_i32_trunc_f32(rd, sm, false);
846            }
847
848            // f64 VFP double-precision instructions (ARM32)
849            // F64 arithmetic: same as F32 but with sz=1 (bit 8 = 1, cp11 = 0xB)
850            ArmOp::F64Add { dd, dn, dm } => encode_vfp_3reg_f64(0xEE300B00, dd, dn, dm)?,
851            ArmOp::F64Sub { dd, dn, dm } => encode_vfp_3reg_f64(0xEE300B40, dd, dn, dm)?,
852            ArmOp::F64Mul { dd, dn, dm } => encode_vfp_3reg_f64(0xEE200B00, dd, dn, dm)?,
853            ArmOp::F64Div { dd, dn, dm } => encode_vfp_3reg_f64(0xEE800B00, dd, dn, dm)?,
854            ArmOp::F64Abs { dd, dm } => encode_vfp_2reg_f64(0xEEB00BC0, dd, dm)?,
855            ArmOp::F64Neg { dd, dm } => encode_vfp_2reg_f64(0xEEB10B40, dd, dm)?,
856            ArmOp::F64Sqrt { dd, dm } => encode_vfp_2reg_f64(0xEEB10BC0, dd, dm)?,
857
858            // f64 pseudo-ops
859            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
860            ArmOp::F64Ceil { dd, dm } => {
861                return self.encode_arm_f64_rounding(dd, dm, 0b01);
862            }
863            ArmOp::F64Floor { dd, dm } => {
864                return self.encode_arm_f64_rounding(dd, dm, 0b10);
865            }
866            ArmOp::F64Trunc { dd, dm } => {
867                return self.encode_arm_f64_rounding(dd, dm, 0b11);
868            }
869            ArmOp::F64Nearest { dd, dm } => {
870                return self.encode_arm_f64_rounding(dd, dm, 0b00);
871            }
872            ArmOp::F64Min { dd, dn, dm } => {
873                return self.encode_arm_f64_minmax(dd, dn, dm, true);
874            }
875            ArmOp::F64Max { dd, dn, dm } => {
876                return self.encode_arm_f64_minmax(dd, dn, dm, false);
877            }
878            ArmOp::F64Copysign { dd, dn, dm } => {
879                return self.encode_arm_f64_copysign(dd, dn, dm);
880            }
881
882            // f64 comparisons
883            ArmOp::F64Eq { rd, dn, dm } => {
884                return self.encode_arm_f64_compare(rd, dn, dm, 0x0);
885            }
886            ArmOp::F64Ne { rd, dn, dm } => {
887                return self.encode_arm_f64_compare(rd, dn, dm, 0x1);
888            }
889            ArmOp::F64Lt { rd, dn, dm } => {
890                return self.encode_arm_f64_compare(rd, dn, dm, 0x4);
891            }
892            ArmOp::F64Le { rd, dn, dm } => {
893                return self.encode_arm_f64_compare(rd, dn, dm, 0x9);
894            }
895            ArmOp::F64Gt { rd, dn, dm } => {
896                return self.encode_arm_f64_compare(rd, dn, dm, 0xC);
897            }
898            ArmOp::F64Ge { rd, dn, dm } => {
899                return self.encode_arm_f64_compare(rd, dn, dm, 0xA);
900            }
901
902            ArmOp::F64Const { dd, value } => {
903                return self.encode_arm_f64_const(dd, *value);
904            }
905
906            ArmOp::F64Load { dd, addr } => encode_vfp_ldst_f64(0xED900B00, dd, addr)?,
907            ArmOp::F64Store { dd, addr } => encode_vfp_ldst_f64(0xED800B00, dd, addr)?,
908
909            ArmOp::F64ConvertI32S { dd, rm } => {
910                return self.encode_arm_f64_convert_i32(dd, rm, true);
911            }
912            ArmOp::F64ConvertI32U { dd, rm } => {
913                return self.encode_arm_f64_convert_i32(dd, rm, false);
914            }
915            ArmOp::F64ConvertI64S { .. } | ArmOp::F64ConvertI64U { .. } => {
916                return Err(synth_core::Error::synthesis(
917                    "F64 i64 conversion not supported (requires register pairs on 32-bit ARM)",
918                ));
919            }
920            ArmOp::F64PromoteF32 { dd, sm } => {
921                return self.encode_arm_f64_promote_f32(dd, sm);
922            }
923            ArmOp::F64ReinterpretI64 { dd, rmlo, rmhi } => {
924                encode_vmov_core_dreg(true, dd, rmlo, rmhi)?
925            }
926            ArmOp::I64ReinterpretF64 { rdlo, rdhi, dm } => {
927                encode_vmov_core_dreg(false, dm, rdlo, rdhi)?
928            }
929            ArmOp::I64TruncF64S { .. } | ArmOp::I64TruncF64U { .. } => {
930                return Err(synth_core::Error::synthesis(
931                    "i64 truncation from F64 not supported (requires i64 register pairs on 32-bit ARM)",
932                ));
933            }
934            ArmOp::I32TruncF64S { rd, dm } => {
935                return self.encode_arm_i32_trunc_f64(rd, dm, true);
936            }
937            ArmOp::I32TruncF64U { rd, dm } => {
938                return self.encode_arm_i32_trunc_f64(rd, dm, false);
939            }
940            // Multi-instruction sequences - only meaningful in Thumb-2 mode
941            ArmOp::I64SetCond { .. }
942            | ArmOp::I64SetCondZ { .. }
943            | ArmOp::I64Mul { .. }
944            | ArmOp::I64Shl { .. }
945            | ArmOp::I64ShrS { .. }
946            | ArmOp::I64ShrU { .. }
947            | ArmOp::I64Rotl { .. }
948            | ArmOp::I64Rotr { .. } => 0xE1A00000, // NOP (Thumb-2 only)
949
950            // MVE instructions — Thumb-2 only (Cortex-M55 is always Thumb-2)
951            ArmOp::MveLoad { .. }
952            | ArmOp::MveStore { .. }
953            | ArmOp::MveConst { .. }
954            | ArmOp::MveAnd { .. }
955            | ArmOp::MveOrr { .. }
956            | ArmOp::MveEor { .. }
957            | ArmOp::MveMvn { .. }
958            | ArmOp::MveBic { .. }
959            | ArmOp::MveAddI { .. }
960            | ArmOp::MveSubI { .. }
961            | ArmOp::MveMulI { .. }
962            | ArmOp::MveNegI { .. }
963            | ArmOp::MveCmpEqI { .. }
964            | ArmOp::MveCmpNeI { .. }
965            | ArmOp::MveCmpLtS { .. }
966            | ArmOp::MveCmpLtU { .. }
967            | ArmOp::MveCmpGtS { .. }
968            | ArmOp::MveCmpGtU { .. }
969            | ArmOp::MveCmpLeS { .. }
970            | ArmOp::MveCmpLeU { .. }
971            | ArmOp::MveCmpGeS { .. }
972            | ArmOp::MveCmpGeU { .. }
973            | ArmOp::MveDup { .. }
974            | ArmOp::MveExtractLane { .. }
975            | ArmOp::MveInsertLane { .. }
976            | ArmOp::MveAddF32 { .. }
977            | ArmOp::MveSubF32 { .. }
978            | ArmOp::MveMulF32 { .. }
979            | ArmOp::MveNegF32 { .. }
980            | ArmOp::MveAbsF32 { .. }
981            | ArmOp::MveCmpEqF32 { .. }
982            | ArmOp::MveCmpNeF32 { .. }
983            | ArmOp::MveCmpLtF32 { .. }
984            | ArmOp::MveCmpLeF32 { .. }
985            | ArmOp::MveCmpGtF32 { .. }
986            | ArmOp::MveCmpGeF32 { .. }
987            | ArmOp::MveDupF32 { .. }
988            | ArmOp::MveExtractLaneF32 { .. }
989            | ArmOp::MveReplaceLaneF32 { .. }
990            | ArmOp::MveDivF32 { .. }
991            | ArmOp::MveSqrtF32 { .. } => 0xE1A00000, // NOP (MVE = Thumb-2 only)
992        };
993
994        // ARM32 instructions are little-endian
995        Ok(instr.to_le_bytes().to_vec())
996    }
997
998    // === ARM32 VFP multi-instruction helpers ===
999
1000    /// Encode F32 comparison as ARM32: VCMP.F32 + VMRS + MOV rd,#0 + MOVcond rd,#1
1001    fn encode_arm_f32_compare(
1002        &self,
1003        rd: &Reg,
1004        sn: &VfpReg,
1005        sm: &VfpReg,
1006        cond_code: u32,
1007    ) -> Result<Vec<u8>> {
1008        let mut bytes = Vec::new();
1009
1010        // VCMP.F32 Sn, Sm: 0xEEB40A40 with Sn in Vd position, Sm in Vm position
1011        let sn_num = vfp_sreg_to_num(sn)?;
1012        let sm_num = vfp_sreg_to_num(sm)?;
1013        let (vd, d) = encode_sreg(sn_num);
1014        let (vm, m) = encode_sreg(sm_num);
1015        let vcmp = 0xEEB40A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1016        bytes.extend_from_slice(&vcmp.to_le_bytes());
1017
1018        // VMRS APSR_nzcv, FPSCR: 0xEEF1FA10
1019        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1020
1021        // MOV rd, #0: 0xE3A0_0000 | (rd << 12)
1022        let rd_bits = reg_to_bits(rd);
1023        let mov_zero = 0xE3A00000 | (rd_bits << 12);
1024        bytes.extend_from_slice(&mov_zero.to_le_bytes());
1025
1026        // MOVcond rd, #1: cond(4) | 0011 1010 0000 rd(4) 0000 0000 0001
1027        let mov_one = (cond_code << 28) | 0x03A00001 | (rd_bits << 12);
1028        bytes.extend_from_slice(&mov_one.to_le_bytes());
1029
1030        Ok(bytes)
1031    }
1032
1033    /// Encode F32 constant load as ARM32: MOVW Rt,#lo16 + MOVT Rt,#hi16 + VMOV Sd,Rt
1034    fn encode_arm_f32_const(&self, sd: &VfpReg, value: f32) -> Result<Vec<u8>> {
1035        let mut bytes = Vec::new();
1036        let bits = value.to_bits();
1037
1038        // Use R12 as temp register for constant loading
1039        let rt: u32 = 12; // R12/IP
1040
1041        // MOVW R12, #lo16: 0xE300_C000 | (imm4 << 16) | imm12
1042        let lo16 = bits & 0xFFFF;
1043        let movw = 0xE3000000 | (rt << 12) | ((lo16 >> 12) << 16) | (lo16 & 0xFFF);
1044        bytes.extend_from_slice(&movw.to_le_bytes());
1045
1046        // MOVT R12, #hi16: 0xE340_C000 | (imm4 << 16) | imm12
1047        let hi16 = (bits >> 16) & 0xFFFF;
1048        let movt = 0xE3400000 | (rt << 12) | ((hi16 >> 12) << 16) | (hi16 & 0xFFF);
1049        bytes.extend_from_slice(&movt.to_le_bytes());
1050
1051        // VMOV Sd, R12
1052        let vmov = encode_vmov_core_sreg(true, sd, &Reg::R12)?;
1053        bytes.extend_from_slice(&vmov.to_le_bytes());
1054
1055        Ok(bytes)
1056    }
1057
1058    /// Encode VMOV + VCVT.F32.S32/U32 as ARM32
1059    fn encode_arm_f32_convert_i32(&self, sd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
1060        let mut bytes = Vec::new();
1061
1062        // VMOV Sd, Rm — move integer to VFP register
1063        let vmov = encode_vmov_core_sreg(true, sd, rm)?;
1064        bytes.extend_from_slice(&vmov.to_le_bytes());
1065
1066        // VCVT.F32.S32 Sd, Sd (signed) or VCVT.F32.U32 Sd, Sd (unsigned)
1067        // Base: 0xEEB80A40 (signed) or 0xEEB80AC0 (unsigned)
1068        let sd_num = vfp_sreg_to_num(sd)?;
1069        let (vd, d) = encode_sreg(sd_num);
1070        let (vm, m) = encode_sreg(sd_num); // same register as source
1071        let base = if signed { 0xEEB80A40 } else { 0xEEB80AC0 };
1072        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
1073        bytes.extend_from_slice(&vcvt.to_le_bytes());
1074
1075        Ok(bytes)
1076    }
1077
1078    /// Encode F32 rounding pseudo-op as ARM32 via VCVT to integer and back.
1079    /// mode: 0b00=nearest, 0b01=floor(-Inf), 0b10=ceil(+Inf), 0b11=trunc(zero)
1080    /// Strategy: VCVT.S32.F32 Sd, Sm (toward zero), then VCVT.F32.S32 Sd, Sd
1081    /// For ceil/floor/nearest, we use VCVTR (round toward mode) + convert back.
1082    /// Simplified: convert to int (toward zero for trunc) then back to float.
1083    /// Encode F32 rounding as ARM32.
1084    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
1085    ///
1086    /// For trunc (mode=0b11): uses VCVTR.S32.F32 (always rounds toward zero).
1087    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F32 (non-R variant
1088    /// which honours FPSCR rmode), then restores FPSCR.
1089    fn encode_arm_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
1090        let mut bytes = Vec::new();
1091        let sm_num = vfp_sreg_to_num(sm)?;
1092        let sd_num = vfp_sreg_to_num(sd)?;
1093        let (vd_s, d_s) = encode_sreg(sd_num);
1094        let (vm_s, m_s) = encode_sreg(sm_num);
1095
1096        if mode == 0b11 {
1097            // Trunc (toward zero): VCVTR.S32.F32 — the "R" variant always truncates.
1098            // 0xEEBD0AC0: bit[7]=1 => round toward zero regardless of FPSCR
1099            let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
1100            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1101        } else {
1102            // ceil/floor/nearest: manipulate FPSCR rounding mode
1103            let rt: u32 = 12; // R12/IP as temp
1104
1105            // VMRS R12, FPSCR
1106            let vmrs = 0xEEF10A10 | (rt << 12);
1107            bytes.extend_from_slice(&vmrs.to_le_bytes());
1108
1109            // BIC R12, R12, #(3 << 22) — clear RMode bits [23:22]
1110            // 3<<22 = 0x00C00000. ARM rotated imm: 0x03 ror 10 (rotation=5, imm8=0x03)
1111            let bic = 0xE3CC0000 | (rt << 12) | (0x05 << 8) | 0x03;
1112            bytes.extend_from_slice(&bic.to_le_bytes());
1113
1114            // ORR R12, R12, #(mode << 22) — set desired rounding mode
1115            if mode != 0 {
1116                // mode<<22: rotation=5, imm8=mode
1117                let orr = 0xE38C0000 | (rt << 12) | (0x05 << 8) | (mode as u32);
1118                bytes.extend_from_slice(&orr.to_le_bytes());
1119            }
1120
1121            // VMSR FPSCR, R12
1122            let vmsr = 0xEEE10A10 | (rt << 12);
1123            bytes.extend_from_slice(&vmsr.to_le_bytes());
1124
1125            // VCVT.S32.F32 Sd, Sm — non-R variant (bit[7]=0), uses FPSCR rounding mode
1126            let vcvt_to_int = 0xEEBD0A40 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
1127            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1128
1129            // Restore FPSCR: clear rmode bits back to nearest (default)
1130            bytes.extend_from_slice(&vmrs.to_le_bytes());
1131            bytes.extend_from_slice(&bic.to_le_bytes());
1132            bytes.extend_from_slice(&vmsr.to_le_bytes());
1133        }
1134
1135        // VCVT.F32.S32 Sd, Sd (convert integer result back to float)
1136        let (vd2, d2) = encode_sreg(sd_num);
1137        let vcvt_to_float = 0xEEB80A40 | (d2 << 22) | (vd2 << 12) | (d_s << 5) | vd_s;
1138        bytes.extend_from_slice(&vcvt_to_float.to_le_bytes());
1139
1140        Ok(bytes)
1141    }
1142
1143    /// Encode F32 min/max as ARM32: VCMP + VMRS + conditional VMOV
1144    fn encode_arm_f32_minmax(
1145        &self,
1146        sd: &VfpReg,
1147        sn: &VfpReg,
1148        sm: &VfpReg,
1149        is_min: bool,
1150    ) -> Result<Vec<u8>> {
1151        let mut bytes = Vec::new();
1152        let sn_num = vfp_sreg_to_num(sn)?;
1153        let sm_num = vfp_sreg_to_num(sm)?;
1154        let sd_num = vfp_sreg_to_num(sd)?;
1155
1156        // VMOV Sd, Sn (start with first operand)
1157        let (vd, d) = encode_sreg(sd_num);
1158        let (vn, n) = encode_sreg(sn_num);
1159        let vmov_sn = 0xEEB00A40 | (d << 22) | (vd << 12) | (n << 5) | vn;
1160        bytes.extend_from_slice(&vmov_sn.to_le_bytes());
1161
1162        // VCMP.F32 Sn, Sm
1163        let (vm, m) = encode_sreg(sm_num);
1164        let vcmp = 0xEEB40A40 | (n << 22) | (vn << 12) | (m << 5) | vm;
1165        bytes.extend_from_slice(&vcmp.to_le_bytes());
1166
1167        // VMRS APSR_nzcv, FPSCR
1168        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1169
1170        // For min: if Sn > Sm (GT), use Sm. Condition = GT (0xC)
1171        // For max: if Sn < Sm (MI/LT), use Sm. Condition = MI (0x4)
1172        let cond = if is_min { 0xCu32 } else { 0x4u32 };
1173
1174        // VMOV{cond} Sd, Sm — conditional VMOV
1175        let vmov_cond = (cond << 28) | 0x0EB00A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1176        bytes.extend_from_slice(&vmov_cond.to_le_bytes());
1177
1178        Ok(bytes)
1179    }
1180
1181    /// Encode F32 copysign as ARM32: extract sign from Sm, magnitude from Sn
1182    fn encode_arm_f32_copysign(&self, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
1183        let mut bytes = Vec::new();
1184
1185        // VMOV R12, Sm (get sign source bits)
1186        let vmov_sm = encode_vmov_core_sreg(false, sm, &Reg::R12)?;
1187        bytes.extend_from_slice(&vmov_sm.to_le_bytes());
1188
1189        // VMOV R0, Sn (get magnitude source bits) — use R0 as temp
1190        let vmov_sn = encode_vmov_core_sreg(false, sn, &Reg::R0)?;
1191        bytes.extend_from_slice(&vmov_sn.to_le_bytes());
1192
1193        // AND R12, R12, #0x80000000 (keep only sign bit)
1194        // Thumb-2 constant 0x80000000 needs special encoding; in ARM32 use rotated imm
1195        // 0x80000000 = 0x02 rotated right by 2 (rotation=1, imm8=0x02)
1196        let and_sign = 0xE2000000u32 | (12 << 16) | (12 << 12) | (1 << 8) | 0x02;
1197        bytes.extend_from_slice(&and_sign.to_le_bytes());
1198
1199        // BIC R0, R0, #0x80000000 (clear sign bit from magnitude)
1200        // R0 = register 0, so Rn and Rd fields are 0
1201        let bic_sign = 0xE3C00000u32 | (1 << 8) | 0x02;
1202        bytes.extend_from_slice(&bic_sign.to_le_bytes());
1203
1204        // ORR R0, R0, R12 (combine sign + magnitude)
1205        // R0 = register 0, so Rn and Rd fields are 0
1206        let orr = 0xE1800000u32 | 12;
1207        bytes.extend_from_slice(&orr.to_le_bytes());
1208
1209        // VMOV Sd, R0
1210        let vmov_result = encode_vmov_core_sreg(true, sd, &Reg::R0)?;
1211        bytes.extend_from_slice(&vmov_result.to_le_bytes());
1212
1213        Ok(bytes)
1214    }
1215
1216    /// Encode F64 comparison as ARM32: VCMP.F64 + VMRS + MOV rd,#0 + MOVcond rd,#1
1217    fn encode_arm_f64_compare(
1218        &self,
1219        rd: &Reg,
1220        dn: &VfpReg,
1221        dm: &VfpReg,
1222        cond_code: u32,
1223    ) -> Result<Vec<u8>> {
1224        let mut bytes = Vec::new();
1225
1226        // VCMP.F64 Dn, Dm: 0xEEB40B40 with Dn in Vd position, Dm in Vm position
1227        let dn_num = vfp_dreg_to_num(dn)?;
1228        let dm_num = vfp_dreg_to_num(dm)?;
1229        let (vd, d) = encode_dreg(dn_num);
1230        let (vm, m) = encode_dreg(dm_num);
1231        let vcmp = 0xEEB40B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1232        bytes.extend_from_slice(&vcmp.to_le_bytes());
1233
1234        // VMRS APSR_nzcv, FPSCR
1235        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1236
1237        // MOV rd, #0
1238        let rd_bits = reg_to_bits(rd);
1239        let mov_zero = 0xE3A00000 | (rd_bits << 12);
1240        bytes.extend_from_slice(&mov_zero.to_le_bytes());
1241
1242        // MOVcond rd, #1
1243        let mov_one = (cond_code << 28) | 0x03A00001 | (rd_bits << 12);
1244        bytes.extend_from_slice(&mov_one.to_le_bytes());
1245
1246        Ok(bytes)
1247    }
1248
1249    /// Encode F64 constant load as ARM32: MOVW + MOVT + MOVW + MOVT + VMOV
1250    fn encode_arm_f64_const(&self, dd: &VfpReg, value: f64) -> Result<Vec<u8>> {
1251        let mut bytes = Vec::new();
1252        let bits = value.to_bits();
1253        let lo32 = bits as u32;
1254        let hi32 = (bits >> 32) as u32;
1255
1256        // Load low 32 bits into R0 (Rd field = 0 for R0)
1257        let lo16 = lo32 & 0xFFFF;
1258        let movw_r0 = 0xE3000000 | ((lo16 >> 12) << 16) | (lo16 & 0xFFF);
1259        bytes.extend_from_slice(&movw_r0.to_le_bytes());
1260        let hi16 = (lo32 >> 16) & 0xFFFF;
1261        let movt_r0 = 0xE3400000 | ((hi16 >> 12) << 16) | (hi16 & 0xFFF);
1262        bytes.extend_from_slice(&movt_r0.to_le_bytes());
1263
1264        // Load high 32 bits into R12
1265        let lo16 = hi32 & 0xFFFF;
1266        let movw_r12 = 0xE3000000 | ((lo16 >> 12) << 16) | (12 << 12) | (lo16 & 0xFFF);
1267        bytes.extend_from_slice(&movw_r12.to_le_bytes());
1268        let hi16 = (hi32 >> 16) & 0xFFFF;
1269        let movt_r12 = 0xE3400000 | ((hi16 >> 12) << 16) | (12 << 12) | (hi16 & 0xFFF);
1270        bytes.extend_from_slice(&movt_r12.to_le_bytes());
1271
1272        // VMOV Dd, R0, R12
1273        let vmov = encode_vmov_core_dreg(true, dd, &Reg::R0, &Reg::R12)?;
1274        bytes.extend_from_slice(&vmov.to_le_bytes());
1275
1276        Ok(bytes)
1277    }
1278
1279    /// Encode VMOV Sd, Rm + VCVT.F64.S32/U32 Dd, Sd as ARM32
1280    fn encode_arm_f64_convert_i32(&self, dd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
1281        let mut bytes = Vec::new();
1282
1283        // Use S0 as intermediate: VMOV S0, Rm
1284        let vmov = encode_vmov_core_sreg(true, &VfpReg::S0, rm)?;
1285        bytes.extend_from_slice(&vmov.to_le_bytes());
1286
1287        // VCVT.F64.S32 Dd, S0 (signed) or VCVT.F64.U32 Dd, S0 (unsigned)
1288        // Base: 0xEEB80B40 (signed) or 0xEEB80BC0 (unsigned)
1289        let dd_num = vfp_dreg_to_num(dd)?;
1290        let (vd, d) = encode_dreg(dd_num);
1291        let base = if signed { 0xEEB80B40 } else { 0xEEB80BC0 };
1292        // S0 is register 0: Vm=0, M=0
1293        let vcvt = base | (d << 22) | (vd << 12);
1294        bytes.extend_from_slice(&vcvt.to_le_bytes());
1295
1296        Ok(bytes)
1297    }
1298
1299    /// Encode VCVT.F64.F32 Dd, Sm as ARM32 (f32 to f64 promotion)
1300    fn encode_arm_f64_promote_f32(&self, dd: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
1301        let dd_num = vfp_dreg_to_num(dd)?;
1302        let sm_num = vfp_sreg_to_num(sm)?;
1303        let (vd, d) = encode_dreg(dd_num);
1304        let (vm, m) = encode_sreg(sm_num);
1305
1306        // VCVT.F64.F32 Dd, Sm: 0xEEB70AC0
1307        let vcvt = 0xEEB70AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
1308        Ok(vcvt.to_le_bytes().to_vec())
1309    }
1310
1311    /// Encode VCVT.S32/U32.F64 Sd, Dm + VMOV Rd, Sd as ARM32
1312    fn encode_arm_i32_trunc_f64(&self, rd: &Reg, dm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
1313        let mut bytes = Vec::new();
1314        let dm_num = vfp_dreg_to_num(dm)?;
1315        let (vm, m) = encode_dreg(dm_num);
1316
1317        // VCVT.S32.F64 S0, Dm (toward zero) or VCVT.U32.F64 S0, Dm
1318        // S0: Vd=0, D=0
1319        let base = if signed { 0xEEBD0BC0 } else { 0xEEBC0BC0 };
1320        let vcvt = base | (m << 5) | vm;
1321        bytes.extend_from_slice(&vcvt.to_le_bytes());
1322
1323        // VMOV Rd, S0
1324        let vmov = encode_vmov_core_sreg(false, &VfpReg::S0, rd)?;
1325        bytes.extend_from_slice(&vmov.to_le_bytes());
1326
1327        Ok(bytes)
1328    }
1329
1330    /// Encode F64 rounding pseudo-op as ARM32 via VCVT to integer and back.
1331    /// Encode F64 rounding as ARM32.
1332    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
1333    ///
1334    /// For trunc: uses VCVTR.S32.F64 (always truncates).
1335    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F64 (non-R variant),
1336    /// then restores FPSCR.
1337    fn encode_arm_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
1338        let mut bytes = Vec::new();
1339        let dm_num = vfp_dreg_to_num(dm)?;
1340        let dd_num = vfp_dreg_to_num(dd)?;
1341        let (vm, m) = encode_dreg(dm_num);
1342        let (vd, d) = encode_dreg(dd_num);
1343
1344        if mode == 0b11 {
1345            // Trunc (toward zero): VCVTR.S32.F64 — bit[7]=1, always truncates
1346            let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
1347            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1348        } else {
1349            // ceil/floor/nearest: manipulate FPSCR rounding mode
1350            let rt: u32 = 12;
1351
1352            // VMRS R12, FPSCR
1353            let vmrs = 0xEEF10A10 | (rt << 12);
1354            bytes.extend_from_slice(&vmrs.to_le_bytes());
1355
1356            // BIC R12, R12, #(3 << 22)
1357            let bic = 0xE3CC0000 | (rt << 12) | (0x05 << 8) | 0x03;
1358            bytes.extend_from_slice(&bic.to_le_bytes());
1359
1360            // ORR R12, R12, #(mode << 22)
1361            if mode != 0 {
1362                let orr = 0xE38C0000 | (rt << 12) | (0x05 << 8) | (mode as u32);
1363                bytes.extend_from_slice(&orr.to_le_bytes());
1364            }
1365
1366            // VMSR FPSCR, R12
1367            let vmsr = 0xEEE10A10 | (rt << 12);
1368            bytes.extend_from_slice(&vmsr.to_le_bytes());
1369
1370            // VCVT.S32.F64 S0, Dm — non-R variant (bit[7]=0), uses FPSCR rmode
1371            let vcvt_to_int = 0xEEBD0B40 | (m << 5) | vm;
1372            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1373
1374            // Restore FPSCR
1375            bytes.extend_from_slice(&vmrs.to_le_bytes());
1376            bytes.extend_from_slice(&bic.to_le_bytes());
1377            bytes.extend_from_slice(&vmsr.to_le_bytes());
1378        }
1379
1380        // VCVT.F64.S32 Dd, S0 (convert back to double)
1381        let vcvt_to_float = 0xEEB80B40 | (d << 22) | (vd << 12);
1382        bytes.extend_from_slice(&vcvt_to_float.to_le_bytes());
1383
1384        Ok(bytes)
1385    }
1386
1387    /// Encode F64 min/max as ARM32: VMOV + VCMP + VMRS + conditional VMOV
1388    fn encode_arm_f64_minmax(
1389        &self,
1390        dd: &VfpReg,
1391        dn: &VfpReg,
1392        dm: &VfpReg,
1393        is_min: bool,
1394    ) -> Result<Vec<u8>> {
1395        let mut bytes = Vec::new();
1396        let dn_num = vfp_dreg_to_num(dn)?;
1397        let dm_num = vfp_dreg_to_num(dm)?;
1398        let dd_num = vfp_dreg_to_num(dd)?;
1399
1400        // VMOV.F64 Dd, Dn (start with first operand)
1401        let (vd, d) = encode_dreg(dd_num);
1402        let (vn, n) = encode_dreg(dn_num);
1403        let vmov_dn = 0xEEB00B40 | (d << 22) | (vd << 12) | (n << 5) | vn;
1404        bytes.extend_from_slice(&vmov_dn.to_le_bytes());
1405
1406        // VCMP.F64 Dn, Dm
1407        let (vm, m) = encode_dreg(dm_num);
1408        let vcmp = 0xEEB40B40 | (n << 22) | (vn << 12) | (m << 5) | vm;
1409        bytes.extend_from_slice(&vcmp.to_le_bytes());
1410
1411        // VMRS APSR_nzcv, FPSCR
1412        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1413
1414        let cond = if is_min { 0xCu32 } else { 0x4u32 };
1415        let vmov_cond = (cond << 28) | 0x0EB00B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1416        bytes.extend_from_slice(&vmov_cond.to_le_bytes());
1417
1418        Ok(bytes)
1419    }
1420
1421    /// Encode F64 copysign as ARM32
1422    fn encode_arm_f64_copysign(&self, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<Vec<u8>> {
1423        let mut bytes = Vec::new();
1424
1425        // VMOV R0, R12, Dm (get sign source bits)
1426        let vmov_dm = encode_vmov_core_dreg(false, dm, &Reg::R0, &Reg::R12)?;
1427        bytes.extend_from_slice(&vmov_dm.to_le_bytes());
1428
1429        // VMOV R1, R2, Dn (get magnitude source bits)
1430        // We use R1 (lo) and R2 (hi) for the magnitude
1431        let vmov_dn = encode_vmov_core_dreg(false, dn, &Reg::R1, &Reg::R2)?;
1432        bytes.extend_from_slice(&vmov_dn.to_le_bytes());
1433
1434        // AND R12, R12, #0x80000000 (keep only sign bit from hi word)
1435        let and_sign = 0xE2000000u32 | (12 << 16) | (12 << 12) | (1 << 8) | 0x02;
1436        bytes.extend_from_slice(&and_sign.to_le_bytes());
1437
1438        // BIC R2, R2, #0x80000000 (clear sign bit from magnitude hi word)
1439        let bic_sign = 0xE3C00000u32 | (2 << 16) | (2 << 12) | (1 << 8) | 0x02;
1440        bytes.extend_from_slice(&bic_sign.to_le_bytes());
1441
1442        // ORR R2, R2, R12 (combine sign + magnitude)
1443        let orr = 0xE1800000u32 | (2 << 16) | (2 << 12) | 12;
1444        bytes.extend_from_slice(&orr.to_le_bytes());
1445
1446        // VMOV Dd, R1, R2
1447        let vmov_result = encode_vmov_core_dreg(true, dd, &Reg::R1, &Reg::R2)?;
1448        bytes.extend_from_slice(&vmov_result.to_le_bytes());
1449
1450        Ok(bytes)
1451    }
1452
1453    /// Encode VCVT.S32/U32.F32 + VMOV as ARM32
1454    fn encode_arm_i32_trunc_f32(&self, rd: &Reg, sm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
1455        let mut bytes = Vec::new();
1456
1457        // VCVT.S32.F32 Sd, Sm (toward zero) or VCVT.U32.F32 Sd, Sm
1458        // We use Sm as both source and destination for the intermediate result
1459        let sm_num = vfp_sreg_to_num(sm)?;
1460        let (vd, d) = encode_sreg(sm_num);
1461        let (vm, m) = encode_sreg(sm_num);
1462        let base = if signed { 0xEEBD0AC0 } else { 0xEEBC0AC0 };
1463        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
1464        bytes.extend_from_slice(&vcvt.to_le_bytes());
1465
1466        // VMOV Rd, Sm — move result back to core register
1467        let vmov = encode_vmov_core_sreg(false, sm, rd)?;
1468        bytes.extend_from_slice(&vmov.to_le_bytes());
1469
1470        Ok(bytes)
1471    }
1472
1473    /// Encode an ARM instruction in Thumb-2 mode (16-bit or 32-bit instructions)
1474    fn encode_thumb(&self, op: &ArmOp) -> Result<Vec<u8>> {
1475        // Thumb-2 supports both 16-bit and 32-bit instructions
1476        // 32-bit instructions are encoded as two 16-bit halfwords (big-endian order)
1477        match op {
1478            // === 16-bit Thumb encodings ===
1479            ArmOp::Add { rd, rn, op2 } => {
1480                let rd_bits = reg_to_bits(rd) as u16;
1481                let rn_bits = reg_to_bits(rn) as u16;
1482
1483                if let Operand2::Reg(rm) = op2 {
1484                    let rm_bits = reg_to_bits(rm) as u16;
1485                    // 16-bit ADDS only has 3-bit register fields (R0-R7). For
1486                    // high registers (e.g. R12, the MemLoad/MemStore base
1487                    // scratch) the bits overflow into adjacent fields, silently
1488                    // corrupting the operands — issue #178/#180: `add ip,ip,r0`
1489                    // was emitted as `adds r4,r5,r1`. Guard on all three regs
1490                    // being low and fall back to 32-bit ADD.W otherwise, exactly
1491                    // as the Sub handler below does.
1492                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1493                        // ADDS Rd, Rn, Rm (16-bit): 0001 100 Rm Rn Rd
1494                        let instr: u16 = 0x1800 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1495                        Ok(instr.to_le_bytes().to_vec())
1496                    } else {
1497                        // ADD.W Rd, Rn, Rm (32-bit) for high registers
1498                        self.encode_thumb32_add_reg_raw(
1499                            rd_bits as u32,
1500                            rn_bits as u32,
1501                            rm_bits as u32,
1502                        )
1503                    }
1504                } else if let Operand2::Imm(imm) = op2 {
1505                    if *imm <= 7 && rd_bits < 8 && rn_bits < 8 {
1506                        // ADDS Rd, Rn, #imm3 (16-bit): 0001 110 imm3 Rn Rd
1507                        let instr: u16 = 0x1C00 | ((*imm as u16) << 6) | (rn_bits << 3) | rd_bits;
1508                        Ok(instr.to_le_bytes().to_vec())
1509                    } else {
1510                        // Use 32-bit ADD for larger immediates
1511                        self.encode_thumb32_add(rd, rn, *imm as u32)
1512                    }
1513                } else {
1514                    // Fallback to 32-bit encoding
1515                    self.encode_thumb32_add(rd, rn, 0)
1516                }
1517            }
1518
1519            ArmOp::Sub { rd, rn, op2 } => {
1520                let rd_bits = reg_to_bits(rd) as u16;
1521                let rn_bits = reg_to_bits(rn) as u16;
1522
1523                if let Operand2::Reg(rm) = op2 {
1524                    let rm_bits = reg_to_bits(rm) as u16;
1525                    // 16-bit SUBS can only use low registers (R0-R7)
1526                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1527                        // SUBS Rd, Rn, Rm (16-bit): 0001 101 Rm Rn Rd
1528                        let instr: u16 = 0x1A00 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1529                        Ok(instr.to_le_bytes().to_vec())
1530                    } else {
1531                        // Use 32-bit SUB.W for high registers
1532                        self.encode_thumb32_sub_reg_raw(
1533                            rd_bits as u32,
1534                            rn_bits as u32,
1535                            rm_bits as u32,
1536                        )
1537                    }
1538                } else if let Operand2::Imm(imm) = op2 {
1539                    if *imm <= 7 && rd_bits < 8 && rn_bits < 8 {
1540                        // SUBS Rd, Rn, #imm3 (16-bit): 0001 111 imm3 Rn Rd
1541                        let instr: u16 = 0x1E00 | ((*imm as u16) << 6) | (rn_bits << 3) | rd_bits;
1542                        Ok(instr.to_le_bytes().to_vec())
1543                    } else {
1544                        self.encode_thumb32_sub(rd, rn, *imm as u32)
1545                    }
1546                } else {
1547                    self.encode_thumb32_sub(rd, rn, 0)
1548                }
1549            }
1550
1551            ArmOp::Mov { rd, op2 } => {
1552                let rd_bits = reg_to_bits(rd) as u16;
1553
1554                if let Operand2::Imm(imm) = op2 {
1555                    if *imm <= 255 && rd_bits < 8 {
1556                        // MOVS Rd, #imm8 (16-bit): 0010 0 Rd imm8
1557                        let imm_bits = (*imm as u16) & 0xFF;
1558                        let instr: u16 = 0x2000 | (rd_bits << 8) | imm_bits;
1559                        Ok(instr.to_le_bytes().to_vec())
1560                    } else {
1561                        // Use 32-bit MOVW for larger immediates
1562                        self.encode_thumb32_movw(rd, *imm as u32)
1563                    }
1564                } else if let Operand2::Reg(rm) = op2 {
1565                    let rm_bits = reg_to_bits(rm) as u16;
1566                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
1567                    // D = Rd[3], Rd[2:0] in lower bits
1568                    let d_bit = (rd_bits >> 3) & 1;
1569                    let instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
1570                    Ok(instr.to_le_bytes().to_vec())
1571                } else {
1572                    let instr: u16 = 0xBF00; // NOP fallback
1573                    Ok(instr.to_le_bytes().to_vec())
1574                }
1575            }
1576
1577            ArmOp::Push { regs } => {
1578                // Thumb-2 PUSH encoding:
1579                // If all regs in R0-R7 + LR, use 16-bit: 1011 010 M rrrrrrrr
1580                // Otherwise use 32-bit: STMDB SP!, {regs} = 1110 1001 0010 1101 | 0M0 reglist(13)
1581                let mut reg_list: u16 = 0;
1582                let mut need_32bit = false;
1583                for r in regs {
1584                    let bit = reg_to_bits(r);
1585                    if bit >= 8 && *r != Reg::LR {
1586                        need_32bit = true;
1587                    }
1588                    reg_list |= 1 << bit;
1589                }
1590                if !need_32bit {
1591                    // 16-bit PUSH: 1011 010 M rrrrrrrr
1592                    let m_bit = if reg_list & (1 << 14) != 0 {
1593                        1u16
1594                    } else {
1595                        0u16
1596                    };
1597                    let low_regs = reg_list & 0xFF;
1598                    let instr: u16 = 0xB400 | (m_bit << 8) | low_regs;
1599                    Ok(instr.to_le_bytes().to_vec())
1600                } else {
1601                    // 32-bit STMDB SP!, {regs}: E92D | reglist(16)
1602                    let hw1: u16 = 0xE92D;
1603                    let hw2: u16 = reg_list;
1604                    let mut bytes = hw1.to_le_bytes().to_vec();
1605                    bytes.extend_from_slice(&hw2.to_le_bytes());
1606                    Ok(bytes)
1607                }
1608            }
1609
1610            ArmOp::Pop { regs } => {
1611                // Thumb-2 POP encoding:
1612                // If all regs in R0-R7 + PC, use 16-bit: 1011 110 P rrrrrrrr
1613                // Otherwise use 32-bit: LDMIA SP!, {regs} = 1110 1000 1011 1101 | PM0 reglist(13)
1614                let mut reg_list: u16 = 0;
1615                let mut need_32bit = false;
1616                for r in regs {
1617                    let bit = reg_to_bits(r);
1618                    if bit >= 8 && *r != Reg::PC {
1619                        need_32bit = true;
1620                    }
1621                    reg_list |= 1 << bit;
1622                }
1623                if !need_32bit {
1624                    // 16-bit POP: 1011 110 P rrrrrrrr
1625                    let p_bit = if reg_list & (1 << 15) != 0 {
1626                        1u16
1627                    } else {
1628                        0u16
1629                    };
1630                    let low_regs = reg_list & 0xFF;
1631                    let instr: u16 = 0xBC00 | (p_bit << 8) | low_regs;
1632                    Ok(instr.to_le_bytes().to_vec())
1633                } else {
1634                    // 32-bit LDMIA SP!, {regs}: E8BD | reglist(16)
1635                    let hw1: u16 = 0xE8BD;
1636                    let hw2: u16 = reg_list;
1637                    let mut bytes = hw1.to_le_bytes().to_vec();
1638                    bytes.extend_from_slice(&hw2.to_le_bytes());
1639                    Ok(bytes)
1640                }
1641            }
1642
1643            ArmOp::Nop => {
1644                let instr: u16 = 0xBF00; // NOP in Thumb-2
1645                Ok(instr.to_le_bytes().to_vec())
1646            }
1647
1648            ArmOp::Udf { imm } => {
1649                // UDF (Undefined) in Thumb-2: 16-bit encoding is 0xDE00 | imm8
1650                // This triggers UsageFault/HardFault, used for WASM traps
1651                let instr: u16 = 0xDE00 | (*imm as u16);
1652                let bytes = instr.to_le_bytes().to_vec();
1653                encoding_contracts::verify_thumb16(&bytes);
1654                Ok(bytes)
1655            }
1656
1657            // i64 support: ADDS, ADC, SUBS, SBC for register pair arithmetic
1658            // ADDS sets flags (carry), ADC uses carry from previous ADDS
1659            ArmOp::Adds { rd, rn, op2 } => {
1660                let rd_bits = reg_to_bits(rd) as u16;
1661                let rn_bits = reg_to_bits(rn) as u16;
1662
1663                if let Operand2::Reg(rm) = op2 {
1664                    let rm_bits = reg_to_bits(rm) as u16;
1665                    // 16-bit ADDS is R0-R7 only; i64 pair allocation can place
1666                    // operands in R8-R11, which would overflow the 3-bit fields
1667                    // and corrupt the operands (#178/#180 class). Guard and fall
1668                    // back to 32-bit ADDS.W for high registers.
1669                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1670                        // ADDS Rd, Rn, Rm (16-bit): 0001 100 Rm Rn Rd
1671                        let instr: u16 = 0x1800 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1672                        Ok(instr.to_le_bytes().to_vec())
1673                    } else {
1674                        self.encode_thumb32_adds_reg_raw(
1675                            rd_bits as u32,
1676                            rn_bits as u32,
1677                            rm_bits as u32,
1678                        )
1679                    }
1680                } else {
1681                    // 32-bit Thumb-2 ADDS with immediate
1682                    self.encode_thumb32_adds(rd, rn, 0)
1683                }
1684            }
1685
1686            // ADC: Add with Carry (Thumb-2 32-bit)
1687            // ADC.W Rd, Rn, Rm: EB40 Rn | 00 Rd 00 Rm
1688            ArmOp::Adc { rd, rn, op2 } => {
1689                let rd_bits = reg_to_bits(rd);
1690                let rn_bits = reg_to_bits(rn);
1691
1692                if let Operand2::Reg(rm) = op2 {
1693                    let rm_bits = reg_to_bits(rm);
1694                    // ADC.W Rd, Rn, Rm (T2): 1110 1011 0100 Rn | 0 000 Rd 00 00 Rm
1695                    let hw1: u16 = (0xEB40 | rn_bits) as u16;
1696                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1697
1698                    let mut bytes = hw1.to_le_bytes().to_vec();
1699                    bytes.extend_from_slice(&hw2.to_le_bytes());
1700                    Ok(bytes)
1701                } else {
1702                    // ADC with immediate - use 32-bit encoding
1703                    let hw1: u16 = (0xF140 | rn_bits) as u16;
1704                    let hw2: u16 = (rd_bits << 8) as u16;
1705                    let mut bytes = hw1.to_le_bytes().to_vec();
1706                    bytes.extend_from_slice(&hw2.to_le_bytes());
1707                    Ok(bytes)
1708                }
1709            }
1710
1711            // SUBS sets flags (borrow), SBC uses borrow from previous SUBS
1712            ArmOp::Subs { rd, rn, op2 } => {
1713                let rd_bits = reg_to_bits(rd) as u16;
1714                let rn_bits = reg_to_bits(rn) as u16;
1715
1716                if let Operand2::Reg(rm) = op2 {
1717                    let rm_bits = reg_to_bits(rm) as u16;
1718                    // 16-bit SUBS is R0-R7 only; high-register i64 pair operands
1719                    // would overflow the 3-bit fields (#178/#180 class). Guard
1720                    // and fall back to 32-bit SUBS.W for high registers.
1721                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1722                        // SUBS Rd, Rn, Rm (16-bit): 0001 101 Rm Rn Rd
1723                        let instr: u16 = 0x1A00 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1724                        Ok(instr.to_le_bytes().to_vec())
1725                    } else {
1726                        self.encode_thumb32_subs_reg_raw(
1727                            rd_bits as u32,
1728                            rn_bits as u32,
1729                            rm_bits as u32,
1730                        )
1731                    }
1732                } else {
1733                    // 32-bit Thumb-2 SUBS with immediate
1734                    self.encode_thumb32_subs(rd, rn, 0)
1735                }
1736            }
1737
1738            // SBC: Subtract with Carry (Thumb-2 32-bit)
1739            // SBC.W Rd, Rn, Rm: EB60 Rn | 00 Rd 00 Rm
1740            ArmOp::Sbc { rd, rn, op2 } => {
1741                let rd_bits = reg_to_bits(rd);
1742                let rn_bits = reg_to_bits(rn);
1743
1744                if let Operand2::Reg(rm) = op2 {
1745                    let rm_bits = reg_to_bits(rm);
1746                    // SBC.W Rd, Rn, Rm (T2): 1110 1011 0110 Rn | 0 000 Rd 00 00 Rm
1747                    let hw1: u16 = (0xEB60 | rn_bits) as u16;
1748                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1749
1750                    let mut bytes = hw1.to_le_bytes().to_vec();
1751                    bytes.extend_from_slice(&hw2.to_le_bytes());
1752                    Ok(bytes)
1753                } else {
1754                    // SBC with immediate - use 32-bit encoding
1755                    let hw1: u16 = (0xF160 | rn_bits) as u16;
1756                    let hw2: u16 = (rd_bits << 8) as u16;
1757                    let mut bytes = hw1.to_le_bytes().to_vec();
1758                    bytes.extend_from_slice(&hw2.to_le_bytes());
1759                    Ok(bytes)
1760                }
1761            }
1762
1763            // === 32-bit Thumb-2 encodings ===
1764
1765            // SDIV: 11111011 1001 Rn 1111 Rd 1111 Rm
1766            ArmOp::Sdiv { rd, rn, rm } => {
1767                let rd_bits = reg_to_bits(rd);
1768                let rn_bits = reg_to_bits(rn);
1769                let rm_bits = reg_to_bits(rm);
1770                reg_bits_checked(rd_bits)?;
1771                reg_bits_checked(rn_bits)?;
1772                reg_bits_checked(rm_bits)?;
1773
1774                // Thumb-2 SDIV: FB90 F0F0 | Rn<<16 | Rd<<8 | Rm
1775                // First halfword: 1111 1011 1001 Rn = 0xFB90 | Rn
1776                // Second halfword: 1111 Rd 1111 Rm = 0xF0F0 | Rd<<8 | Rm
1777                let hw1: u16 = (0xFB90 | rn_bits) as u16;
1778                let hw2: u16 = (0xF0F0 | (rd_bits << 8) | rm_bits) as u16;
1779
1780                // Thumb-2 32-bit instructions: first halfword, then second halfword (little-endian each)
1781                let mut bytes = hw1.to_le_bytes().to_vec();
1782                bytes.extend_from_slice(&hw2.to_le_bytes());
1783                encoding_contracts::verify_thumb32(&bytes);
1784                Ok(bytes)
1785            }
1786
1787            // UDIV: 11111011 1011 Rn 1111 Rd 1111 Rm
1788            ArmOp::Udiv { rd, rn, rm } => {
1789                let rd_bits = reg_to_bits(rd);
1790                let rn_bits = reg_to_bits(rn);
1791                let rm_bits = reg_to_bits(rm);
1792                reg_bits_checked(rd_bits)?;
1793                reg_bits_checked(rn_bits)?;
1794                reg_bits_checked(rm_bits)?;
1795
1796                // Thumb-2 UDIV: FBB0 F0F0 | Rn<<16 | Rd<<8 | Rm
1797                let hw1: u16 = (0xFBB0 | rn_bits) as u16;
1798                let hw2: u16 = (0xF0F0 | (rd_bits << 8) | rm_bits) as u16;
1799
1800                let mut bytes = hw1.to_le_bytes().to_vec();
1801                bytes.extend_from_slice(&hw2.to_le_bytes());
1802                encoding_contracts::verify_thumb32(&bytes);
1803                Ok(bytes)
1804            }
1805
1806            ArmOp::Umull { rdlo, rdhi, rn, rm } => {
1807                let rdlo_bits = reg_to_bits(rdlo);
1808                let rdhi_bits = reg_to_bits(rdhi);
1809                let rn_bits = reg_to_bits(rn);
1810                let rm_bits = reg_to_bits(rm);
1811                reg_bits_checked(rdlo_bits)?;
1812                reg_bits_checked(rdhi_bits)?;
1813                reg_bits_checked(rn_bits)?;
1814                reg_bits_checked(rm_bits)?;
1815
1816                // Thumb-2 UMULL: 1111 1011 1010 Rn | RdLo RdHi 0000 Rm
1817                let hw1: u16 = (0xFBA0 | rn_bits) as u16;
1818                let hw2: u16 = ((rdlo_bits << 12) | (rdhi_bits << 8) | rm_bits) as u16;
1819
1820                let mut bytes = hw1.to_le_bytes().to_vec();
1821                bytes.extend_from_slice(&hw2.to_le_bytes());
1822                encoding_contracts::verify_thumb32(&bytes);
1823                Ok(bytes)
1824            }
1825
1826            // MUL (Thumb-2 32-bit): MUL Rd, Rn, Rm
1827            ArmOp::Mul { rd, rn, rm } => {
1828                let rd_bits = reg_to_bits(rd);
1829                let rn_bits = reg_to_bits(rn);
1830                let rm_bits = reg_to_bits(rm);
1831
1832                // Thumb-2 MUL: FB00 F000 | Rn | Rd<<8 | Rm
1833                // 11111011 0000 Rn | 1111 Rd 0000 Rm
1834                let hw1: u16 = (0xFB00 | rn_bits) as u16;
1835                let hw2: u16 = (0xF000 | (rd_bits << 8) | rm_bits) as u16;
1836
1837                let mut bytes = hw1.to_le_bytes().to_vec();
1838                bytes.extend_from_slice(&hw2.to_le_bytes());
1839                Ok(bytes)
1840            }
1841
1842            // MLS: Rd = Ra - Rn * Rm
1843            ArmOp::Mls { rd, rn, rm, ra } => {
1844                let rd_bits = reg_to_bits(rd);
1845                let rn_bits = reg_to_bits(rn);
1846                let rm_bits = reg_to_bits(rm);
1847                let ra_bits = reg_to_bits(ra);
1848
1849                // Thumb-2 MLS: FB00 Rn | Ra Rd 0001 Rm
1850                // 11111011 0000 Rn | Ra Rd 0001 Rm
1851                let hw1: u16 = (0xFB00 | rn_bits) as u16;
1852                let hw2: u16 = ((ra_bits << 12) | (rd_bits << 8) | 0x10 | rm_bits) as u16;
1853
1854                let mut bytes = hw1.to_le_bytes().to_vec();
1855                bytes.extend_from_slice(&hw2.to_le_bytes());
1856                Ok(bytes)
1857            }
1858
1859            // AND (Thumb-2 32-bit)
1860            ArmOp::And { rd, rn, op2 } => {
1861                if let Operand2::Reg(rm) = op2 {
1862                    let rd_bits = reg_to_bits(rd);
1863                    let rn_bits = reg_to_bits(rn);
1864                    let rm_bits = reg_to_bits(rm);
1865
1866                    // Thumb-2 AND register: EA00 Rn | 0 Rd 00 00 Rm
1867                    let hw1: u16 = (0xEA00 | rn_bits) as u16;
1868                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1869
1870                    let mut bytes = hw1.to_le_bytes().to_vec();
1871                    bytes.extend_from_slice(&hw2.to_le_bytes());
1872                    Ok(bytes)
1873                } else if let Operand2::Imm(imm) = op2 {
1874                    let rd_bits = reg_to_bits(rd);
1875                    let rn_bits = reg_to_bits(rn);
1876                    let imm_val = *imm as u32;
1877
1878                    // Thumb-2 AND.W immediate T1: 11110 i 0 0000 S Rn | 0 imm3 Rd imm8
1879                    let i_bit = (imm_val >> 11) & 1;
1880                    let imm3 = (imm_val >> 8) & 0x7;
1881                    let imm8 = imm_val & 0xFF;
1882
1883                    let hw1: u16 = (0xF000 | (i_bit << 10) | rn_bits) as u16;
1884                    let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
1885
1886                    let mut bytes = hw1.to_le_bytes().to_vec();
1887                    bytes.extend_from_slice(&hw2.to_le_bytes());
1888                    Ok(bytes)
1889                } else {
1890                    // RegShift variant - fallback to NOP
1891                    let instr: u16 = 0xBF00;
1892                    Ok(instr.to_le_bytes().to_vec())
1893                }
1894            }
1895
1896            // ORR (Thumb-2 32-bit)
1897            ArmOp::Orr { rd, rn, op2 } => {
1898                if let Operand2::Reg(rm) = op2 {
1899                    let rd_bits = reg_to_bits(rd);
1900                    let rn_bits = reg_to_bits(rn);
1901                    let rm_bits = reg_to_bits(rm);
1902
1903                    // Thumb-2 ORR: EA40 Rn | 0 Rd 00 00 Rm
1904                    let hw1: u16 = (0xEA40 | rn_bits) as u16;
1905                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1906
1907                    let mut bytes = hw1.to_le_bytes().to_vec();
1908                    bytes.extend_from_slice(&hw2.to_le_bytes());
1909                    Ok(bytes)
1910                } else {
1911                    let instr: u16 = 0xBF00;
1912                    Ok(instr.to_le_bytes().to_vec())
1913                }
1914            }
1915
1916            // EOR (Thumb-2 32-bit)
1917            ArmOp::Eor { rd, rn, op2 } => {
1918                if let Operand2::Reg(rm) = op2 {
1919                    let rd_bits = reg_to_bits(rd);
1920                    let rn_bits = reg_to_bits(rn);
1921                    let rm_bits = reg_to_bits(rm);
1922
1923                    // Thumb-2 EOR: EA80 Rn | 0 Rd 00 00 Rm
1924                    let hw1: u16 = (0xEA80 | rn_bits) as u16;
1925                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1926
1927                    let mut bytes = hw1.to_le_bytes().to_vec();
1928                    bytes.extend_from_slice(&hw2.to_le_bytes());
1929                    Ok(bytes)
1930                } else {
1931                    let instr: u16 = 0xBF00;
1932                    Ok(instr.to_le_bytes().to_vec())
1933                }
1934            }
1935
1936            // Shift operations (16-bit for low registers)
1937            ArmOp::Lsl { rd, rn, shift } => {
1938                let rd_bits = reg_to_bits(rd) as u16;
1939                let rn_bits = reg_to_bits(rn) as u16;
1940                let shift_bits = (*shift as u16) & 0x1F;
1941
1942                if rd_bits < 8 && rn_bits < 8 {
1943                    // LSLS Rd, Rm, #imm5 (16-bit): 0000 0 imm5 Rm Rd
1944                    let instr: u16 = (shift_bits << 6) | (rn_bits << 3) | rd_bits;
1945                    Ok(instr.to_le_bytes().to_vec())
1946                } else {
1947                    // Use 32-bit encoding for high registers
1948                    self.encode_thumb32_shift(rd, rn, *shift, 0b00) // LSL type
1949                }
1950            }
1951
1952            ArmOp::Lsr { rd, rn, shift } => {
1953                let rd_bits = reg_to_bits(rd) as u16;
1954                let rn_bits = reg_to_bits(rn) as u16;
1955                let shift_bits = (*shift as u16) & 0x1F;
1956
1957                if rd_bits < 8 && rn_bits < 8 && shift_bits > 0 {
1958                    // LSRS Rd, Rm, #imm5 (16-bit): 0000 1 imm5 Rm Rd
1959                    let instr: u16 = 0x0800 | (shift_bits << 6) | (rn_bits << 3) | rd_bits;
1960                    Ok(instr.to_le_bytes().to_vec())
1961                } else {
1962                    self.encode_thumb32_shift(rd, rn, *shift, 0b01) // LSR type
1963                }
1964            }
1965
1966            ArmOp::Asr { rd, rn, shift } => {
1967                let rd_bits = reg_to_bits(rd) as u16;
1968                let rn_bits = reg_to_bits(rn) as u16;
1969                let shift_bits = (*shift as u16) & 0x1F;
1970
1971                if rd_bits < 8 && rn_bits < 8 && shift_bits > 0 {
1972                    // ASRS Rd, Rm, #imm5 (16-bit): 0001 0 imm5 Rm Rd
1973                    let instr: u16 = 0x1000 | (shift_bits << 6) | (rn_bits << 3) | rd_bits;
1974                    Ok(instr.to_le_bytes().to_vec())
1975                } else {
1976                    self.encode_thumb32_shift(rd, rn, *shift, 0b10) // ASR type
1977                }
1978            }
1979
1980            ArmOp::Ror { rd, rn, shift } => {
1981                // ROR doesn't have a 16-bit immediate form, use 32-bit
1982                self.encode_thumb32_shift(rd, rn, *shift, 0b11) // ROR type
1983            }
1984
1985            // Register-based shifts (Thumb-2 32-bit)
1986            // Encoding: 11111010 0xxS Rn 1111 Rd 0000 Rm
1987            // xx = shift type: 00=LSL, 01=LSR, 10=ASR, 11=ROR
1988            ArmOp::LslReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b00),
1989            ArmOp::LsrReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b01),
1990            ArmOp::AsrReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b10),
1991            ArmOp::RorReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b11),
1992
1993            // RSB (Reverse Subtract): Rd = imm - Rn
1994            // Thumb-2 T2 encoding: 11110 i 0 1110 S Rn | 0 imm3 Rd imm8
1995            ArmOp::Rsb { rd, rn, imm } => {
1996                let rd_bits = reg_to_bits(rd);
1997                let rn_bits = reg_to_bits(rn);
1998                let imm_val = *imm;
1999
2000                let i_bit = (imm_val >> 11) & 1;
2001                let imm3 = (imm_val >> 8) & 0x7;
2002                let imm8 = imm_val & 0xFF;
2003
2004                // hw1: 11110 i 01110 0 Rn  (S=0)
2005                let hw1: u16 = (0xF1C0 | (i_bit << 10) | rn_bits) as u16;
2006                // hw2: 0 imm3 Rd imm8
2007                let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
2008
2009                let mut bytes = hw1.to_le_bytes().to_vec();
2010                bytes.extend_from_slice(&hw2.to_le_bytes());
2011                Ok(bytes)
2012            }
2013
2014            // CLZ (Thumb-2 32-bit)
2015            ArmOp::Clz { rd, rm } => {
2016                let rd_bits = reg_to_bits(rd);
2017                let rm_bits = reg_to_bits(rm);
2018
2019                // Thumb-2 CLZ: FAB0 Rm | F8 Rd Rm
2020                // 11111010 1011 Rm | 1111 1000 Rd Rm
2021                let hw1: u16 = (0xFAB0 | rm_bits) as u16;
2022                let hw2: u16 = (0xF080 | (rd_bits << 8) | rm_bits) as u16;
2023
2024                let mut bytes = hw1.to_le_bytes().to_vec();
2025                bytes.extend_from_slice(&hw2.to_le_bytes());
2026                Ok(bytes)
2027            }
2028
2029            // RBIT (Thumb-2 32-bit)
2030            ArmOp::Rbit { rd, rm } => {
2031                let rd_bits = reg_to_bits(rd);
2032                let rm_bits = reg_to_bits(rm);
2033
2034                // Thumb-2 RBIT: FA90 Rm | F0 Rd A0 Rm
2035                // 11111010 1001 Rm | 1111 Rd 1010 Rm
2036                let hw1: u16 = (0xFA90 | rm_bits) as u16;
2037                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rm_bits) as u16;
2038
2039                let mut bytes = hw1.to_le_bytes().to_vec();
2040                bytes.extend_from_slice(&hw2.to_le_bytes());
2041                Ok(bytes)
2042            }
2043
2044            // SXTB (16-bit for low registers)
2045            ArmOp::Sxtb { rd, rm } => {
2046                let rd_bits = reg_to_bits(rd) as u16;
2047                let rm_bits = reg_to_bits(rm) as u16;
2048
2049                if rd_bits < 8 && rm_bits < 8 {
2050                    // SXTB Rd, Rm (16-bit): 1011 0010 01 Rm Rd
2051                    let instr: u16 = 0xB240 | (rm_bits << 3) | rd_bits;
2052                    Ok(instr.to_le_bytes().to_vec())
2053                } else {
2054                    // Thumb-2 SXTB.W: FA4F F(rd)80 (rm)
2055                    // 11111010 0100 1111 | 1111 Rd 10 rotate Rm
2056                    let rd_bits32 = rd_bits as u32;
2057                    let rm_bits32 = rm_bits as u32;
2058                    let hw1: u16 = 0xFA4F;
2059                    let hw2: u16 = (0xF080 | (rd_bits32 << 8) | rm_bits32) as u16;
2060                    let mut bytes = hw1.to_le_bytes().to_vec();
2061                    bytes.extend_from_slice(&hw2.to_le_bytes());
2062                    Ok(bytes)
2063                }
2064            }
2065
2066            // SXTH (16-bit for low registers)
2067            ArmOp::Sxth { rd, rm } => {
2068                let rd_bits = reg_to_bits(rd) as u16;
2069                let rm_bits = reg_to_bits(rm) as u16;
2070
2071                if rd_bits < 8 && rm_bits < 8 {
2072                    // SXTH Rd, Rm (16-bit): 1011 0010 00 Rm Rd
2073                    let instr: u16 = 0xB200 | (rm_bits << 3) | rd_bits;
2074                    Ok(instr.to_le_bytes().to_vec())
2075                } else {
2076                    // Thumb-2 SXTH.W: FA0F F(rd)80 (rm)
2077                    // 11111010 0000 1111 | 1111 Rd 10 rotate Rm
2078                    let rd_bits32 = rd_bits as u32;
2079                    let rm_bits32 = rm_bits as u32;
2080                    let hw1: u16 = 0xFA0F;
2081                    let hw2: u16 = (0xF080 | (rd_bits32 << 8) | rm_bits32) as u16;
2082                    let mut bytes = hw1.to_le_bytes().to_vec();
2083                    bytes.extend_from_slice(&hw2.to_le_bytes());
2084                    Ok(bytes)
2085                }
2086            }
2087
2088            // CMP (can be 16-bit for low registers)
2089            ArmOp::Cmp { rn, op2 } => {
2090                let rn_bits = reg_to_bits(rn) as u16;
2091
2092                if let Operand2::Imm(imm) = op2 {
2093                    // Only use 16-bit encoding for non-negative immediates 0-255
2094                    // Negative immediates must use 32-bit encoding
2095                    if *imm >= 0 && *imm <= 255 && rn_bits < 8 {
2096                        // CMP Rn, #imm8 (16-bit): 0010 1 Rn imm8
2097                        let instr: u16 = 0x2800 | (rn_bits << 8) | (*imm as u16 & 0xFF);
2098                        Ok(instr.to_le_bytes().to_vec())
2099                    } else {
2100                        self.encode_thumb32_cmp_imm(rn, *imm as u32)
2101                    }
2102                } else if let Operand2::Reg(rm) = op2 {
2103                    let rm_bits = reg_to_bits(rm) as u16;
2104                    if rn_bits < 8 && rm_bits < 8 {
2105                        // CMP Rn, Rm (16-bit low): 0100 0010 10 Rm Rn
2106                        let instr: u16 = 0x4280 | (rm_bits << 3) | rn_bits;
2107                        Ok(instr.to_le_bytes().to_vec())
2108                    } else {
2109                        // CMP Rn, Rm (16-bit high): 0100 0101 N Rm Rn[2:0]
2110                        let n_bit = (rn_bits >> 3) & 1;
2111                        let instr: u16 = 0x4500 | (n_bit << 7) | (rm_bits << 3) | (rn_bits & 0x7);
2112                        Ok(instr.to_le_bytes().to_vec())
2113                    }
2114                } else {
2115                    let instr: u16 = 0xBF00;
2116                    Ok(instr.to_le_bytes().to_vec())
2117                }
2118            }
2119
2120            // CMN (Compare Negative) - computes Rn + op2 and sets flags
2121            // CMN Rn, #1 sets Z flag if Rn == -1 (since -1 + 1 = 0)
2122            ArmOp::Cmn { rn, op2 } => {
2123                let rn_bits = reg_to_bits(rn) as u16;
2124
2125                if let Operand2::Imm(imm) = op2 {
2126                    // CMN.W Rn, #imm (32-bit encoding)
2127                    // Encoding: F110 Rn | 0F00 imm8 (for small immediates 0-255)
2128                    if *imm >= 0 && *imm <= 255 {
2129                        let imm8 = *imm as u16 & 0xFF;
2130                        let hw1: u16 = 0xF110 | rn_bits;
2131                        let hw2: u16 = 0x0F00 | imm8;
2132                        let mut bytes = hw1.to_le_bytes().to_vec();
2133                        bytes.extend_from_slice(&hw2.to_le_bytes());
2134                        Ok(bytes)
2135                    } else {
2136                        // For other immediates, fallback to NOP (should not happen in our use case)
2137                        Ok(vec![0xBF, 0x00])
2138                    }
2139                } else if let Operand2::Reg(rm) = op2 {
2140                    let rm_bits = reg_to_bits(rm) as u16;
2141                    // 16-bit CMN (T1) only encodes R0-R7; high registers overflow
2142                    // the 3-bit fields and corrupt the operands (#184, the #180
2143                    // class). CMN has no high-register 16-bit form, so fall back
2144                    // to 32-bit CMN.W (T2): EB10 Rn | 0F00 Rm (ADD.W with S=1 and
2145                    // Rd discarded as PC/1111).
2146                    if rn_bits < 8 && rm_bits < 8 {
2147                        // CMN Rn, Rm (16-bit): 0100 0010 11 Rm Rn
2148                        let instr: u16 = 0x42C0 | (rm_bits << 3) | rn_bits;
2149                        Ok(instr.to_le_bytes().to_vec())
2150                    } else {
2151                        let hw1: u16 = 0xEB10 | rn_bits;
2152                        let hw2: u16 = 0x0F00 | rm_bits;
2153                        let mut bytes = hw1.to_le_bytes().to_vec();
2154                        bytes.extend_from_slice(&hw2.to_le_bytes());
2155                        Ok(bytes)
2156                    }
2157                } else {
2158                    Ok(vec![0xBF, 0x00])
2159                }
2160            }
2161
2162            // LDR (can be 16-bit for simple cases)
2163            ArmOp::Ldr { rd, addr } => {
2164                let rd_bits = reg_to_bits(rd);
2165                let base_bits = reg_to_bits(&addr.base);
2166
2167                // Handle register offset mode [base, Roff] or [base, Roff, #imm]
2168                if let Some(offset_reg) = &addr.offset_reg {
2169                    let rm_bits = reg_to_bits(offset_reg);
2170
2171                    // If there's also an immediate offset, we need to ADD it first
2172                    if addr.offset != 0 {
2173                        // Use R12 (IP) as scratch to avoid clobbering the address register
2174                        // ADD R12, Rm, #offset; LDR Rd, [base, R12]
2175                        let scratch = Reg::R12;
2176                        let mut bytes =
2177                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2178                        bytes.extend(self.encode_thumb32_ldr_reg(rd, &addr.base, &scratch)?);
2179                        return Ok(bytes);
2180                    }
2181
2182                    // Simple register offset: LDR Rd, [Rn, Rm]
2183                    // 16-bit: only if Rd, Rn, Rm < R8
2184                    if rd_bits < 8 && base_bits < 8 && rm_bits < 8 {
2185                        // LDR Rd, [Rn, Rm] (16-bit): 0101 100 Rm Rn Rd
2186                        let instr: u16 = 0x5800
2187                            | ((rm_bits as u16) << 6)
2188                            | ((base_bits as u16) << 3)
2189                            | (rd_bits as u16);
2190                        return Ok(instr.to_le_bytes().to_vec());
2191                    }
2192
2193                    // 32-bit register offset
2194                    return self.encode_thumb32_ldr_reg(rd, &addr.base, offset_reg);
2195                }
2196
2197                // Immediate offset mode [base, #imm]
2198                let offset = addr.offset as u32;
2199
2200                if rd_bits < 8 && base_bits < 8 && (offset & 0x3) == 0 && offset <= 124 {
2201                    // LDR Rd, [Rn, #imm5*4] (16-bit): 0110 1 imm5 Rn Rd
2202                    let imm5 = (offset >> 2) as u16;
2203                    let instr: u16 =
2204                        0x6800 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2205                    Ok(instr.to_le_bytes().to_vec())
2206                } else {
2207                    self.encode_thumb32_ldr(rd, &addr.base, offset)
2208                }
2209            }
2210
2211            // STR (can be 16-bit for simple cases)
2212            ArmOp::Str { rd, addr } => {
2213                let rd_bits = reg_to_bits(rd);
2214                let base_bits = reg_to_bits(&addr.base);
2215
2216                // Handle register offset mode [base, Roff] or [base, Roff, #imm]
2217                if let Some(offset_reg) = &addr.offset_reg {
2218                    let rm_bits = reg_to_bits(offset_reg);
2219
2220                    // If there's also an immediate offset, we need to ADD it first
2221                    if addr.offset != 0 {
2222                        // Use R12 (IP) as scratch to avoid clobbering the address register
2223                        // ADD R12, Rm, #offset; STR Rd, [base, R12]
2224                        let scratch = Reg::R12;
2225                        let mut bytes =
2226                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2227                        bytes.extend(self.encode_thumb32_str_reg(rd, &addr.base, &scratch)?);
2228                        return Ok(bytes);
2229                    }
2230
2231                    // Simple register offset: STR Rd, [Rn, Rm]
2232                    // 16-bit: only if Rd, Rn, Rm < R8
2233                    if rd_bits < 8 && base_bits < 8 && rm_bits < 8 {
2234                        // STR Rd, [Rn, Rm] (16-bit): 0101 000 Rm Rn Rd
2235                        let instr: u16 = 0x5000
2236                            | ((rm_bits as u16) << 6)
2237                            | ((base_bits as u16) << 3)
2238                            | (rd_bits as u16);
2239                        return Ok(instr.to_le_bytes().to_vec());
2240                    }
2241
2242                    // 32-bit register offset
2243                    return self.encode_thumb32_str_reg(rd, &addr.base, offset_reg);
2244                }
2245
2246                // Immediate offset mode [base, #imm]
2247                let offset = addr.offset as u32;
2248
2249                if rd_bits < 8 && base_bits < 8 && (offset & 0x3) == 0 && offset <= 124 {
2250                    // STR Rd, [Rn, #imm5*4] (16-bit): 0110 0 imm5 Rn Rd
2251                    let imm5 = (offset >> 2) as u16;
2252                    let instr: u16 =
2253                        0x6000 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2254                    Ok(instr.to_le_bytes().to_vec())
2255                } else {
2256                    self.encode_thumb32_str(rd, &addr.base, offset)
2257                }
2258            }
2259
2260            // LDRB (Thumb-2)
2261            ArmOp::Ldrb { rd, addr } => {
2262                let rd_bits = reg_to_bits(rd);
2263                let base_bits = reg_to_bits(&addr.base);
2264
2265                if let Some(offset_reg) = &addr.offset_reg {
2266                    if addr.offset != 0 {
2267                        let scratch = Reg::R12;
2268                        let mut bytes =
2269                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2270                        bytes.extend(self.encode_thumb32_ldrb_reg(rd, &addr.base, &scratch)?);
2271                        return Ok(bytes);
2272                    }
2273                    return self.encode_thumb32_ldrb_reg(rd, &addr.base, offset_reg);
2274                }
2275
2276                let offset = addr.offset as u32;
2277                if rd_bits < 8 && base_bits < 8 && offset <= 31 {
2278                    // LDRB Rd, [Rn, #imm5] (16-bit): 0111 1 imm5 Rn Rd
2279                    let instr: u16 = 0x7800
2280                        | ((offset as u16) << 6)
2281                        | ((base_bits as u16) << 3)
2282                        | (rd_bits as u16);
2283                    Ok(instr.to_le_bytes().to_vec())
2284                } else {
2285                    self.encode_thumb32_ldrb_imm(rd, &addr.base, offset)
2286                }
2287            }
2288
2289            // LDRSB (Thumb-2)
2290            ArmOp::Ldrsb { rd, addr } => {
2291                let rd_bits = reg_to_bits(rd);
2292                let base_bits = reg_to_bits(&addr.base);
2293
2294                if let Some(offset_reg) = &addr.offset_reg {
2295                    if addr.offset != 0 {
2296                        let scratch = Reg::R12;
2297                        let mut bytes =
2298                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2299                        bytes.extend(self.encode_thumb32_ldrsb_reg(rd, &addr.base, &scratch)?);
2300                        return Ok(bytes);
2301                    }
2302                    return self.encode_thumb32_ldrsb_reg(rd, &addr.base, offset_reg);
2303                }
2304
2305                let offset = addr.offset as u32;
2306                // LDRSB has no 16-bit immediate form (only register)
2307                // For 16-bit reg form: only if Rd, Rn, Rm < R8
2308                if rd_bits < 8 && base_bits < 8 && offset == 0 {
2309                    // No immediate 16-bit encoding for LDRSB; use 32-bit
2310                    self.encode_thumb32_ldrsb_imm(rd, &addr.base, offset)
2311                } else {
2312                    self.encode_thumb32_ldrsb_imm(rd, &addr.base, offset)
2313                }
2314            }
2315
2316            // LDRH (Thumb-2)
2317            ArmOp::Ldrh { rd, addr } => {
2318                let rd_bits = reg_to_bits(rd);
2319                let base_bits = reg_to_bits(&addr.base);
2320
2321                if let Some(offset_reg) = &addr.offset_reg {
2322                    if addr.offset != 0 {
2323                        let scratch = Reg::R12;
2324                        let mut bytes =
2325                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2326                        bytes.extend(self.encode_thumb32_ldrh_reg(rd, &addr.base, &scratch)?);
2327                        return Ok(bytes);
2328                    }
2329                    return self.encode_thumb32_ldrh_reg(rd, &addr.base, offset_reg);
2330                }
2331
2332                let offset = addr.offset as u32;
2333                if rd_bits < 8 && base_bits < 8 && (offset & 0x1) == 0 && offset <= 62 {
2334                    // LDRH Rd, [Rn, #imm5*2] (16-bit): 1000 1 imm5 Rn Rd
2335                    let imm5 = (offset >> 1) as u16;
2336                    let instr: u16 =
2337                        0x8800 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2338                    Ok(instr.to_le_bytes().to_vec())
2339                } else {
2340                    self.encode_thumb32_ldrh_imm(rd, &addr.base, offset)
2341                }
2342            }
2343
2344            // LDRSH (Thumb-2)
2345            ArmOp::Ldrsh { rd, addr } => {
2346                if let Some(offset_reg) = &addr.offset_reg {
2347                    if addr.offset != 0 {
2348                        let scratch = Reg::R12;
2349                        let mut bytes =
2350                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2351                        bytes.extend(self.encode_thumb32_ldrsh_reg(rd, &addr.base, &scratch)?);
2352                        return Ok(bytes);
2353                    }
2354                    return self.encode_thumb32_ldrsh_reg(rd, &addr.base, offset_reg);
2355                }
2356
2357                let offset = addr.offset as u32;
2358                self.encode_thumb32_ldrsh_imm(rd, &addr.base, offset)
2359            }
2360
2361            // STRB (Thumb-2)
2362            ArmOp::Strb { rd, addr } => {
2363                let rd_bits = reg_to_bits(rd);
2364                let base_bits = reg_to_bits(&addr.base);
2365
2366                if let Some(offset_reg) = &addr.offset_reg {
2367                    if addr.offset != 0 {
2368                        let scratch = Reg::R12;
2369                        let mut bytes =
2370                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2371                        bytes.extend(self.encode_thumb32_strb_reg(rd, &addr.base, &scratch)?);
2372                        return Ok(bytes);
2373                    }
2374                    return self.encode_thumb32_strb_reg(rd, &addr.base, offset_reg);
2375                }
2376
2377                let offset = addr.offset as u32;
2378                if rd_bits < 8 && base_bits < 8 && offset <= 31 {
2379                    // STRB Rd, [Rn, #imm5] (16-bit): 0111 0 imm5 Rn Rd
2380                    let instr: u16 = 0x7000
2381                        | ((offset as u16) << 6)
2382                        | ((base_bits as u16) << 3)
2383                        | (rd_bits as u16);
2384                    Ok(instr.to_le_bytes().to_vec())
2385                } else {
2386                    self.encode_thumb32_strb_imm(rd, &addr.base, offset)
2387                }
2388            }
2389
2390            // STRH (Thumb-2)
2391            ArmOp::Strh { rd, addr } => {
2392                let rd_bits = reg_to_bits(rd);
2393                let base_bits = reg_to_bits(&addr.base);
2394
2395                if let Some(offset_reg) = &addr.offset_reg {
2396                    if addr.offset != 0 {
2397                        let scratch = Reg::R12;
2398                        let mut bytes =
2399                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2400                        bytes.extend(self.encode_thumb32_strh_reg(rd, &addr.base, &scratch)?);
2401                        return Ok(bytes);
2402                    }
2403                    return self.encode_thumb32_strh_reg(rd, &addr.base, offset_reg);
2404                }
2405
2406                let offset = addr.offset as u32;
2407                if rd_bits < 8 && base_bits < 8 && (offset & 0x1) == 0 && offset <= 62 {
2408                    // STRH Rd, [Rn, #imm5*2] (16-bit): 1000 0 imm5 Rn Rd
2409                    let imm5 = (offset >> 1) as u16;
2410                    let instr: u16 =
2411                        0x8000 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2412                    Ok(instr.to_le_bytes().to_vec())
2413                } else {
2414                    self.encode_thumb32_strh_imm(rd, &addr.base, offset)
2415                }
2416            }
2417
2418            // MemorySize (Thumb-2)
2419            ArmOp::MemorySize { rd } => {
2420                // LSR rd, R10, #16 — memory size in bytes / 65536 = pages
2421                // Thumb-2 16-bit: LSRS Rd, Rm, #imm5 — 0000 1 imm5 Rm Rd
2422                let rd_bits = reg_to_bits(rd);
2423                let r10_bits = reg_to_bits(&Reg::R10);
2424                if rd_bits < 8 && r10_bits < 8 {
2425                    let instr: u16 =
2426                        0x0800 | (16u16 << 6) | ((r10_bits as u16) << 3) | (rd_bits as u16);
2427                    Ok(instr.to_le_bytes().to_vec())
2428                } else {
2429                    // Thumb-2 32-bit LSR: 1110 1010 010 0 1111 | 0 imm3 Rd imm2 01 Rm
2430                    let imm5: u32 = 16;
2431                    let imm3 = (imm5 >> 2) & 0x7;
2432                    let imm2 = imm5 & 0x3;
2433                    let hw1: u16 = 0xEA4F;
2434                    let hw2: u16 =
2435                        ((imm3 << 12) | (rd_bits << 8) | (imm2 << 6) | 0x10 | r10_bits) as u16;
2436                    let mut bytes = hw1.to_le_bytes().to_vec();
2437                    bytes.extend_from_slice(&hw2.to_le_bytes());
2438                    Ok(bytes)
2439                }
2440            }
2441
2442            // MemoryGrow (Thumb-2)
2443            ArmOp::MemoryGrow { rd, .. } => {
2444                // On embedded with fixed memory, always return -1 (failure)
2445                // MVN rd, #0 → MOV rd, #-1
2446                // Thumb-2 32-bit: MVN: 1111 0 i 0 0 0 1 1 0 1111 | 0 imm3 Rd imm8
2447                let rd_bits = reg_to_bits(rd);
2448                let hw1: u16 = 0xF06F; // MVN with i=0
2449                let hw2: u16 = (rd_bits << 8) as u16; // imm8=0 → ~0 = 0xFFFFFFFF = -1
2450                let mut bytes = hw1.to_le_bytes().to_vec();
2451                bytes.extend_from_slice(&hw2.to_le_bytes());
2452                Ok(bytes)
2453            }
2454
2455            // BX (16-bit)
2456            ArmOp::Bx { rm } => {
2457                let rm_bits = reg_to_bits(rm) as u16;
2458                // BX Rm (16-bit): 0100 0111 0 Rm 000
2459                let instr: u16 = 0x4700 | (rm_bits << 3);
2460                Ok(instr.to_le_bytes().to_vec())
2461            }
2462
2463            // BLX (16-bit) - Branch with Link and Exchange
2464            // BLX Rm: 0100 0111 1 Rm 000
2465            ArmOp::Blx { rm } => {
2466                let rm_bits = reg_to_bits(rm) as u16;
2467                let instr: u16 = 0x4780 | (rm_bits << 3);
2468                Ok(instr.to_le_bytes().to_vec())
2469            }
2470
2471            // CallIndirect - indirect function call via table lookup
2472            // table_index_reg contains the table index
2473            // Generates: LSL R12, idx, #2; LDR R12, [R12, table_base]; BLX R12
2474            ArmOp::CallIndirect {
2475                rd: _,
2476                type_idx: _,
2477                table_index_reg,
2478            } => {
2479                let idx_reg = reg_to_bits(table_index_reg);
2480                let mut bytes = Vec::new();
2481
2482                // For now, we generate code that:
2483                // 1. Multiplies index by 4 (function pointer size)
2484                // 2. Loads function pointer from table (assumes table base in R11)
2485                // 3. Calls the function via BLX
2486                //
2487                // Table base setup must be done by caller/runtime.
2488                // This is a simplified implementation - full support needs:
2489                // - Table base address resolution
2490                // - Type signature checking
2491                // - Bounds checking
2492
2493                // LSL R12, idx_reg, #2 (multiply index by 4)
2494                // Thumb-2 MOV with shift: 11101010 010 S 1111 | 0 imm3 Rd imm2 type Rm
2495                // LSL: type=00, imm5=2 -> imm3=0, imm2=10
2496                let hw1: u16 = 0xEA4F_u16; // MOV.W R12, Rm, LSL #2
2497                let hw2: u16 = ((0x0C00 | (0b10 << 4)) | idx_reg) as u16;
2498                bytes.extend_from_slice(&hw1.to_le_bytes());
2499                bytes.extend_from_slice(&hw2.to_le_bytes());
2500
2501                // LDR R12, [R11, R12] - load function pointer
2502                // Thumb-2 LDR (register): 1111 1000 0101 Rn | Rt 0000 00 imm2 Rm
2503                // Rn=R11, Rt=R12, Rm=R12, imm2=00 (no shift)
2504                let ldr_hw1: u16 = 0xF85B; // LDR.W Rt, [R11, Rm]
2505                let ldr_hw2: u16 = 0xC00C; // Rt=R12, imm2=00, Rm=R12
2506                bytes.extend_from_slice(&ldr_hw1.to_le_bytes());
2507                bytes.extend_from_slice(&ldr_hw2.to_le_bytes());
2508
2509                // BLX R12 (call function indirectly)
2510                // BLX Rm (16-bit): 0100 0111 1 Rm 000
2511                let blx: u16 = 0x47E0; // BLX R12
2512                bytes.extend_from_slice(&blx.to_le_bytes());
2513
2514                Ok(bytes)
2515            }
2516
2517            // Label pseudo-instruction: emits no machine code
2518            ArmOp::Label { .. } => Ok(Vec::new()),
2519
2520            // Conditional branch to label (generic) - offset 0, will be patched
2521            ArmOp::Bcc { cond, label: _ } => {
2522                use synth_synthesis::Condition;
2523                let cond_bits: u16 = match cond {
2524                    Condition::EQ => 0x0,
2525                    Condition::NE => 0x1,
2526                    Condition::HS => 0x2,
2527                    Condition::LO => 0x3,
2528                    Condition::HI => 0x8,
2529                    Condition::LS => 0x9,
2530                    Condition::GE => 0xA,
2531                    Condition::LT => 0xB,
2532                    Condition::GT => 0xC,
2533                    Condition::LE => 0xD,
2534                };
2535                // 16-bit B<cond> with offset 0: 1101 cond imm8
2536                let instr: u16 = 0xD000 | (cond_bits << 8);
2537                Ok(instr.to_le_bytes().to_vec())
2538            }
2539
2540            // Branch instructions
2541            ArmOp::B { label: _ } => {
2542                // Simplified: B.N with offset 0
2543                // For real usage, would need label resolution
2544                let instr: u16 = 0xE000; // B.N #0
2545                Ok(instr.to_le_bytes().to_vec())
2546            }
2547
2548            // BHS (Branch if Higher or Same) - used for bounds checking
2549            // Condition code: 0x2 (C set)
2550            ArmOp::Bhs { label: _ } => {
2551                // 16-bit B<cond> with offset 0: 1101 cond imm8
2552                // cond = 0x2 (HS)
2553                let instr: u16 = 0xD200; // BHS.N #0
2554                Ok(instr.to_le_bytes().to_vec())
2555            }
2556
2557            // BLO (Branch if Lower) - complementary to BHS
2558            // Condition code: 0x3 (C clear)
2559            ArmOp::Blo { label: _ } => {
2560                // 16-bit B<cond> with offset 0: 1101 cond imm8
2561                // cond = 0x3 (LO)
2562                let instr: u16 = 0xD300; // BLO.N #0
2563                Ok(instr.to_le_bytes().to_vec())
2564            }
2565
2566            // Branch with numeric offset (Thumb-2)
2567            // Thumb-2 B.W instruction: 32-bit with +-16MB range
2568            ArmOp::BOffset { offset } => {
2569                // offset is already the halfword displacement: (target - branch - 4) / 2
2570                // This is the raw encoded value, accounting for variable-length instructions
2571                let halfword_offset = *offset;
2572
2573                // 16-bit B.N encoding: 1110 0 imm11 (11-bit signed halfword offset)
2574                // Range: -1024 to +1022 halfwords
2575                if (-1024..=1022).contains(&halfword_offset) {
2576                    // 16-bit B.N encoding: 1110 0 imm11
2577                    let imm11 = (halfword_offset as u16) & 0x7FF;
2578                    let instr: u16 = 0xE000 | imm11;
2579                    Ok(instr.to_le_bytes().to_vec())
2580                } else {
2581                    // 32-bit B.W encoding for larger offsets
2582                    // First halfword: 1111 0 S imm10
2583                    // Second halfword: 10 J1 0 J2 imm11
2584                    // Total offset = SignExtend(S:I1:I2:imm10:imm11:0)
2585                    // where I1 = NOT(J1 XOR S), I2 = NOT(J2 XOR S)
2586
2587                    // The B.W (T4) encoding packs the signed offset as:
2588                    //   S:I1:I2:imm10:imm11:0  (25-bit signed, halfword-aligned)
2589                    // where J1 = NOT(I1 XOR S), J2 = NOT(I2 XOR S)
2590                    // Input halfword_offset already equals (target - PC - 4) / 2,
2591                    // so the full byte offset = halfword_offset << 1.
2592                    // The encoding fields split that 25-bit signed value (including the
2593                    // implicit trailing zero) as: S | imm10 | imm11
2594                    // with I1 = bit 23 and I2 = bit 22 of the signed offset.
2595                    let signed_offset = halfword_offset << 1; // byte offset
2596                    let s = if signed_offset < 0 { 1u32 } else { 0u32 };
2597                    let uoffset = signed_offset as u32;
2598                    let imm10 = (uoffset >> 12) & 0x3FF; // bits [21:12]
2599                    let imm11 = (uoffset >> 1) & 0x7FF; // bits [11:1]
2600                    let i1 = (uoffset >> 23) & 1; // bit 23
2601                    let i2 = (uoffset >> 22) & 1; // bit 22
2602                    let j1 = (!(i1 ^ s)) & 1; // J1 = NOT(I1 XOR S)
2603                    let j2 = (!(i2 ^ s)) & 1; // J2 = NOT(I2 XOR S)
2604
2605                    let hw1: u16 = (0xF000 | (s << 10) | imm10) as u16;
2606                    let hw2: u16 = (0x9000 | (j1 << 13) | (j2 << 11) | imm11) as u16;
2607
2608                    let mut bytes = hw1.to_le_bytes().to_vec();
2609                    bytes.extend_from_slice(&hw2.to_le_bytes());
2610                    Ok(bytes)
2611                }
2612            }
2613
2614            // Conditional branch with numeric offset (Thumb-2)
2615            ArmOp::BCondOffset { cond, offset } => {
2616                use synth_synthesis::Condition;
2617                let cond_bits: u16 = match cond {
2618                    Condition::EQ => 0x0,
2619                    Condition::NE => 0x1,
2620                    Condition::HS => 0x2,
2621                    Condition::LO => 0x3,
2622                    Condition::HI => 0x8,
2623                    Condition::LS => 0x9,
2624                    Condition::GE => 0xA,
2625                    Condition::LT => 0xB,
2626                    Condition::GT => 0xC,
2627                    Condition::LE => 0xD,
2628                };
2629
2630                // offset is already the halfword displacement: (target - branch - 4) / 2
2631                // This is the raw imm8 value for 16-bit B<cond> encoding
2632                let halfword_offset = *offset;
2633
2634                // 16-bit B<cond> encoding: 1101 cond imm8
2635                // Range: -256 to +254 halfwords (imm8 is sign-extended and shifted left 1)
2636                if (-128..=127).contains(&halfword_offset) {
2637                    let imm8 = (halfword_offset as u16) & 0xFF;
2638                    let instr: u16 = 0xD000 | (cond_bits << 8) | imm8;
2639                    Ok(instr.to_le_bytes().to_vec())
2640                } else {
2641                    // 32-bit B<cond>.W for larger offsets
2642                    // First halfword: 1111 0 S cond imm6
2643                    // Second halfword: 10 J1 0 J2 imm11
2644                    let offset = halfword_offset >> 1;
2645                    let s = if offset < 0 { 1u32 } else { 0u32 };
2646                    let imm6 = ((offset >> 11) as u32) & 0x3F;
2647                    let imm11 = (offset as u32) & 0x7FF;
2648                    let j1 = if s == 1 { 1 } else { 0 };
2649                    let j2 = if s == 1 { 1 } else { 0 };
2650
2651                    let hw1: u16 = (0xF000 | (s << 10) | ((cond_bits as u32) << 6) | imm6) as u16;
2652                    let hw2: u16 = (0x8000 | (j1 << 13) | (j2 << 11) | imm11) as u16;
2653
2654                    let mut bytes = hw1.to_le_bytes().to_vec();
2655                    bytes.extend_from_slice(&hw2.to_le_bytes());
2656                    Ok(bytes)
2657                }
2658            }
2659
2660            ArmOp::Bl { label: _ } => {
2661                // BL is always 32-bit in Thumb-2, encoded here as a relocatable
2662                // placeholder; an R_ARM_THM_CALL relocation patches the target
2663                // (see arm_backend.rs). The placeholder must carry an embedded
2664                // addend of -4 so the relocation nets to exactly the symbol S.
2665                //
2666                // Thumb BL computes `target = (P + 4) + signed_offset`. Under
2667                // R_ARM_THM_CALL the linker resolves using the in-place addend;
2668                // a 0xF800 placeholder (addend 0) lands at S+4 — every call one
2669                // instruction past the callee entry (#174). The correct
2670                // placeholder is what `gas` emits for `bl <extern>`:
2671                //   f7ff fffe  ->  `bl <self>`  (S=1, J1=J2=1, imm = -4 addend),
2672                // i.e. hw1=0xF7FF, hw2=0xFFFE. This nets to S, not S+4.
2673                // (The earlier 0xD000 was worse still — a ~+0x600000 addend,
2674                // the garbage `bl c0000c` and "truncated to fit" of #167.)
2675                let hw1: u16 = 0xF7FF;
2676                let hw2: u16 = 0xFFFE;
2677                let mut bytes = hw1.to_le_bytes().to_vec();
2678                bytes.extend_from_slice(&hw2.to_le_bytes());
2679                Ok(bytes)
2680            }
2681
2682            // MVN
2683            ArmOp::Mvn { rd, op2 } => {
2684                if let Operand2::Reg(rm) = op2 {
2685                    let rd_bits = reg_to_bits(rd) as u16;
2686                    let rm_bits = reg_to_bits(rm) as u16;
2687
2688                    if rd_bits < 8 && rm_bits < 8 {
2689                        // MVNS Rd, Rm (16-bit): 0100 0011 11 Rm Rd
2690                        let instr: u16 = 0x43C0 | (rm_bits << 3) | rd_bits;
2691                        Ok(instr.to_le_bytes().to_vec())
2692                    } else {
2693                        // 32-bit MVN
2694                        let hw1: u16 = 0xEA6F_u16;
2695                        let hw2: u16 = ((reg_to_bits(rd) << 8) | reg_to_bits(rm)) as u16;
2696                        let mut bytes = hw1.to_le_bytes().to_vec();
2697                        bytes.extend_from_slice(&hw2.to_le_bytes());
2698                        Ok(bytes)
2699                    }
2700                } else {
2701                    let instr: u16 = 0xBF00;
2702                    Ok(instr.to_le_bytes().to_vec())
2703                }
2704            }
2705
2706            // MOVW - Move Wide (Thumb-2 32-bit)
2707            ArmOp::Movw { rd, imm16 } => {
2708                self.encode_thumb32_movw_raw(reg_to_bits(rd), *imm16 as u32)
2709            }
2710
2711            // MOVT - Move Top (Thumb-2 32-bit)
2712            ArmOp::Movt { rd, imm16 } => {
2713                self.encode_thumb32_movt_raw(reg_to_bits(rd), *imm16 as u32)
2714            }
2715
2716            // SetCond: Materialize condition flag into register (0 or 1)
2717            // Strategy: ITE <cond>; MOV Rd, #1; MOV Rd, #0
2718            // IMPORTANT: Must use ITE (If-Then-Else) because 16-bit Thumb MOV
2719            // always sets flags (MOVS). We need to evaluate the condition BEFORE
2720            // any MOV instruction clobbers the flags from CMP.
2721            ArmOp::SetCond { rd, cond } => {
2722                let rd_bits = reg_to_bits(rd) as u16;
2723
2724                // Condition code encoding for IT block
2725                use synth_synthesis::Condition;
2726                let cond_bits: u16 = match cond {
2727                    Condition::EQ => 0x0,
2728                    Condition::NE => 0x1,
2729                    Condition::LT => 0xB,
2730                    Condition::LE => 0xD,
2731                    Condition::GT => 0xC,
2732                    Condition::GE => 0xA,
2733                    Condition::LO => 0x3, // CC/LO (unsigned <)
2734                    Condition::LS => 0x9, // LS (unsigned <=)
2735                    Condition::HI => 0x8, // HI (unsigned >)
2736                    Condition::HS => 0x2, // CS/HS (unsigned >=)
2737                };
2738
2739                // ITE <cond>: encodes If-Then-Else block
2740                // The mask field depends on firstcond[0]:
2741                // - If firstcond[0] = 0: mask = 0xC for TE pattern (ITE EQ = BF0C)
2742                // - If firstcond[0] = 1: mask = 0x4 for TE pattern (ITE NE = BF14)
2743                let mask = if (cond_bits & 1) == 0 { 0xC } else { 0x4 };
2744                let ite_instr: u16 = 0xBF00 | (cond_bits << 4) | mask;
2745
2746                // Materialize 0/1 into Rd. The 16-bit MOVS (T1) encodes Rd in a
2747                // 3-bit field (bits[10:8]) — only R0–R7. For a high register
2748                // (R8–R12) `rd_bits << 8` overflows into bit 11 and silently
2749                // turns MOVS into CMP (00100 → 00101), corrupting the result
2750                // (this mis-materialized gale's `has_waiter`, so its `local.set`
2751                // stored a stale register → the binary-sem WAKE dispatch read
2752                // garbage). Use the 32-bit MOV.W (T2) for high registers, which
2753                // has a 4-bit Rd field. MOV.W with S=0 doesn't set flags, which
2754                // is fine inside the ITE (the materialized value is the result;
2755                // the flags are not consumed afterwards).
2756                let mut bytes = ite_instr.to_le_bytes().to_vec();
2757                let push_mov = |bytes: &mut Vec<u8>, imm: u16| {
2758                    if rd_bits <= 7 {
2759                        let m: u16 = 0x2000 | (rd_bits << 8) | imm; // 16-bit MOVS Rd,#imm
2760                        bytes.extend_from_slice(&m.to_le_bytes());
2761                    } else {
2762                        // 32-bit MOV.W Rd, #imm (T2): F04F | (Rd<<8) | imm8
2763                        let hw1: u16 = 0xF04F;
2764                        let hw2: u16 = (rd_bits << 8) | imm;
2765                        bytes.extend_from_slice(&hw1.to_le_bytes());
2766                        bytes.extend_from_slice(&hw2.to_le_bytes());
2767                    }
2768                };
2769                push_mov(&mut bytes, 1); // Then branch (condition true)  → 1
2770                push_mov(&mut bytes, 0); // Else branch (condition false) → 0
2771                Ok(bytes)
2772            }
2773
2774            // I64SetCond: Compare two i64 register pairs, result 0/1 in rd
2775            // EQ/NE: CMP lo,lo; IT EQ; CMPEQ hi,hi; ITE <cond>; MOV 1; MOV 0
2776            // LT: CMP lo,lo; SBCS rd,hi,hi; ITE LT; MOV 1; MOV 0
2777            // GT: CMP lo,lo (swapped); SBCS rd,hi,hi (swapped); ITE LT; MOV 1; MOV 0
2778            ArmOp::I64SetCond {
2779                rd,
2780                rn_lo,
2781                rn_hi,
2782                rm_lo,
2783                rm_hi,
2784                cond,
2785            } => {
2786                use synth_synthesis::Condition;
2787                let rd_bits = reg_to_bits(rd) as u16;
2788                let mut bytes = Vec::new();
2789
2790                // Helper: encode CMP Rn, Rm (16-bit)
2791                let encode_cmp_reg = |rn: &synth_synthesis::Reg,
2792                                      rm: &synth_synthesis::Reg|
2793                 -> Vec<u8> {
2794                    let rn_bits = reg_to_bits(rn) as u16;
2795                    let rm_bits = reg_to_bits(rm) as u16;
2796                    if rn_bits < 8 && rm_bits < 8 {
2797                        let instr: u16 = 0x4280 | (rm_bits << 3) | rn_bits;
2798                        instr.to_le_bytes().to_vec()
2799                    } else {
2800                        let n_bit = (rn_bits >> 3) & 1;
2801                        let instr: u16 = 0x4500 | (n_bit << 7) | (rm_bits << 3) | (rn_bits & 0x7);
2802                        instr.to_le_bytes().to_vec()
2803                    }
2804                };
2805
2806                // Helper: encode ITE <cond> (2 bytes)
2807                let encode_ite = |cond_bits: u16| -> Vec<u8> {
2808                    let mask = if (cond_bits & 1) == 0 { 0xC } else { 0x4 };
2809                    let ite_instr: u16 = 0xBF00 | (cond_bits << 4) | mask;
2810                    ite_instr.to_le_bytes().to_vec()
2811                };
2812
2813                // Helper: encode SetCond (ITE + MOV #1 + MOV #0) for given condition
2814                let encode_setcond = |cond_bits: u16, rd_bits: u16| -> Vec<u8> {
2815                    let mut b = encode_ite(cond_bits);
2816                    let mov_one: u16 = 0x2001 | (rd_bits << 8);
2817                    let mov_zero: u16 = 0x2000 | (rd_bits << 8);
2818                    b.extend_from_slice(&mov_one.to_le_bytes());
2819                    b.extend_from_slice(&mov_zero.to_le_bytes());
2820                    b
2821                };
2822
2823                match cond {
2824                    Condition::EQ | Condition::NE => {
2825                        // CMP rn_lo, rm_lo (compare low words)
2826                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2827
2828                        // IT EQ (execute next instruction only if Z=1)
2829                        let it_eq: u16 = 0xBF08; // IT EQ: cond=0000, mask=1000
2830                        bytes.extend_from_slice(&it_eq.to_le_bytes());
2831
2832                        // CMPEQ rn_hi, rm_hi (compare high words, only if low equal)
2833                        bytes.extend_from_slice(&encode_cmp_reg(rn_hi, rm_hi));
2834
2835                        // ITE <cond>; MOV rd, #1; MOV rd, #0
2836                        let cond_bits: u16 = match cond {
2837                            Condition::EQ => 0x0,
2838                            Condition::NE => 0x1,
2839                            _ => unreachable!(),
2840                        };
2841                        bytes.extend_from_slice(&encode_setcond(cond_bits, rd_bits));
2842                    }
2843
2844                    Condition::LT => {
2845                        // CMP rn_lo, rm_lo (sets C flag for borrow)
2846                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2847
2848                        // SBCS rd, rn_hi, rm_hi (subtract with carry, sets N,V flags)
2849                        // SBCS.W Rd, Rn, Rm: EB70 Rn | 0000 Rd 0000 Rm
2850                        let rn_hi_bits = reg_to_bits(rn_hi);
2851                        let rm_hi_bits = reg_to_bits(rm_hi);
2852                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2853                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2854                        bytes.extend_from_slice(&hw1.to_le_bytes());
2855                        bytes.extend_from_slice(&hw2.to_le_bytes());
2856
2857                        // ITE LT; MOV rd, #1; MOV rd, #0
2858                        bytes.extend_from_slice(&encode_setcond(0xB, rd_bits)); // LT = 0xB
2859                    }
2860
2861                    Condition::GT => {
2862                        // GT(a,b) = LT(b,a): swap operands
2863                        // CMP rm_lo, rn_lo (swapped)
2864                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2865
2866                        // SBCS rd, rm_hi, rn_hi (swapped)
2867                        let rm_hi_bits = reg_to_bits(rm_hi);
2868                        let rn_hi_bits = reg_to_bits(rn_hi);
2869                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2870                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2871                        bytes.extend_from_slice(&hw1.to_le_bytes());
2872                        bytes.extend_from_slice(&hw2.to_le_bytes());
2873
2874                        // ITE LT; MOV rd, #1; MOV rd, #0
2875                        bytes.extend_from_slice(&encode_setcond(0xB, rd_bits)); // LT = 0xB
2876                    }
2877
2878                    Condition::LE => {
2879                        // LE(a,b) = !GT(a,b): use GT logic but invert result
2880                        // GT(a,b) = LT(b,a): so we do CMP(b,a) and check LT, then invert
2881                        // CMP rm_lo, rn_lo (swapped, same as GT)
2882                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2883
2884                        // SBCS rd, rm_hi, rn_hi (swapped)
2885                        let rm_hi_bits = reg_to_bits(rm_hi);
2886                        let rn_hi_bits = reg_to_bits(rn_hi);
2887                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2888                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2889                        bytes.extend_from_slice(&hw1.to_le_bytes());
2890                        bytes.extend_from_slice(&hw2.to_le_bytes());
2891
2892                        // ITE GE; MOV rd, #1; MOV rd, #0 (GE is !LT, so inverting GT result)
2893                        bytes.extend_from_slice(&encode_setcond(0xA, rd_bits)); // GE = 0xA
2894                    }
2895
2896                    Condition::GE => {
2897                        // GE(a,b) = !LT(a,b): use LT logic but invert result
2898                        // CMP rn_lo, rm_lo (same as LT)
2899                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2900
2901                        // SBCS rd, rn_hi, rm_hi (same as LT)
2902                        let rn_hi_bits = reg_to_bits(rn_hi);
2903                        let rm_hi_bits = reg_to_bits(rm_hi);
2904                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2905                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2906                        bytes.extend_from_slice(&hw1.to_le_bytes());
2907                        bytes.extend_from_slice(&hw2.to_le_bytes());
2908
2909                        // ITE GE; MOV rd, #1; MOV rd, #0 (GE is !LT)
2910                        bytes.extend_from_slice(&encode_setcond(0xA, rd_bits)); // GE = 0xA
2911                    }
2912
2913                    // Unsigned comparisons - same instruction sequence, different conditions
2914                    Condition::LO => {
2915                        // LO (unsigned LT): CMP lo, SBCS hi, check C=0
2916                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2917                        let rn_hi_bits = reg_to_bits(rn_hi);
2918                        let rm_hi_bits = reg_to_bits(rm_hi);
2919                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2920                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2921                        bytes.extend_from_slice(&hw1.to_le_bytes());
2922                        bytes.extend_from_slice(&hw2.to_le_bytes());
2923                        bytes.extend_from_slice(&encode_setcond(0x3, rd_bits)); // LO = 0x3 (CC)
2924                    }
2925
2926                    Condition::HI => {
2927                        // HI (unsigned GT): swap operands and check LO
2928                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2929                        let rm_hi_bits = reg_to_bits(rm_hi);
2930                        let rn_hi_bits = reg_to_bits(rn_hi);
2931                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2932                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2933                        bytes.extend_from_slice(&hw1.to_le_bytes());
2934                        bytes.extend_from_slice(&hw2.to_le_bytes());
2935                        bytes.extend_from_slice(&encode_setcond(0x3, rd_bits)); // LO = 0x3 (CC)
2936                    }
2937
2938                    Condition::LS => {
2939                        // LS (unsigned LE): !(a > b) = !(HI), so do HI and invert
2940                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
2941                        let rm_hi_bits = reg_to_bits(rm_hi);
2942                        let rn_hi_bits = reg_to_bits(rn_hi);
2943                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
2944                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
2945                        bytes.extend_from_slice(&hw1.to_le_bytes());
2946                        bytes.extend_from_slice(&hw2.to_le_bytes());
2947                        bytes.extend_from_slice(&encode_setcond(0x2, rd_bits)); // HS = 0x2 (CS) = !LO
2948                    }
2949
2950                    Condition::HS => {
2951                        // HS (unsigned GE): !(a < b) = !(LO)
2952                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
2953                        let rn_hi_bits = reg_to_bits(rn_hi);
2954                        let rm_hi_bits = reg_to_bits(rm_hi);
2955                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
2956                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
2957                        bytes.extend_from_slice(&hw1.to_le_bytes());
2958                        bytes.extend_from_slice(&hw2.to_le_bytes());
2959                        bytes.extend_from_slice(&encode_setcond(0x2, rd_bits)); // HS = 0x2 (CS) = !LO
2960                    }
2961                }
2962
2963                Ok(bytes)
2964            }
2965
2966            // I64SetCondZ: Test if i64 register pair is zero, result 0/1 in rd
2967            // ORR.W rd, rn_lo, rn_hi; CMP rd, #0; ITE EQ; MOV 1; MOV 0
2968            ArmOp::I64SetCondZ { rd, rn_lo, rn_hi } => {
2969                let rd_bits = reg_to_bits(rd);
2970                let rn_lo_bits = reg_to_bits(rn_lo);
2971                let rn_hi_bits = reg_to_bits(rn_hi);
2972                let mut bytes = Vec::new();
2973
2974                // ORR.W rd, rn_lo, rn_hi: EA40 rn_lo | 0000 rd 0000 rn_hi
2975                let hw1: u16 = (0xEA40 | rn_lo_bits) as u16;
2976                let hw2: u16 = ((rd_bits << 8) | rn_hi_bits) as u16;
2977                bytes.extend_from_slice(&hw1.to_le_bytes());
2978                bytes.extend_from_slice(&hw2.to_le_bytes());
2979
2980                // CMP rd, #0 (16-bit): 0010 1 Rd 0000 0000
2981                let cmp_instr: u16 = 0x2800 | ((rd_bits as u16) << 8);
2982                bytes.extend_from_slice(&cmp_instr.to_le_bytes());
2983
2984                // ITE EQ; MOV rd, #1; MOV rd, #0
2985                let mask = 0xC_u16; // ITE EQ mask: firstcond[0]=0, mask=0xC
2986                let ite_instr: u16 = 0xBF00 | mask;
2987                bytes.extend_from_slice(&ite_instr.to_le_bytes());
2988                let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
2989                let mov_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
2990                bytes.extend_from_slice(&mov_one.to_le_bytes());
2991                bytes.extend_from_slice(&mov_zero.to_le_bytes());
2992
2993                Ok(bytes)
2994            }
2995
2996            // I64Mul: 64-bit multiply using UMULL + MLA cross products
2997            // Formula: result = (a_lo * b_lo) + ((a_lo * b_hi + a_hi * b_lo) << 32)
2998            // Uses R12 as scratch register
2999            ArmOp::I64Mul {
3000                rd_lo,
3001                rd_hi,
3002                rn_lo,
3003                rn_hi,
3004                rm_lo,
3005                rm_hi,
3006            } => {
3007                let rd_lo_bits = reg_to_bits(rd_lo);
3008                let rd_hi_bits = reg_to_bits(rd_hi);
3009                let rn_lo_bits = reg_to_bits(rn_lo);
3010                let rn_hi_bits = reg_to_bits(rn_hi);
3011                let rm_lo_bits = reg_to_bits(rm_lo);
3012                let rm_hi_bits = reg_to_bits(rm_hi);
3013                let r12: u32 = 12; // IP scratch register
3014                let mut bytes = Vec::new();
3015
3016                // 1. MUL R12, rn_lo, rm_hi  (R12 = a_lo * b_hi)
3017                // Thumb-2 MUL: hw1=0xFB00|Rn, hw2=0xF000|(Rd<<8)|Rm
3018                let hw1: u16 = (0xFB00 | rn_lo_bits) as u16;
3019                let hw2: u16 = (0xF000 | (r12 << 8) | rm_hi_bits) as u16;
3020                bytes.extend_from_slice(&hw1.to_le_bytes());
3021                bytes.extend_from_slice(&hw2.to_le_bytes());
3022
3023                // 2. MLA R12, rn_hi, rm_lo, R12  (R12 += a_hi * b_lo)
3024                // Thumb-2 MLA: hw1=0xFB00|Rn, hw2=(Ra<<12)|(Rd<<8)|Rm
3025                let hw1: u16 = (0xFB00 | rn_hi_bits) as u16;
3026                let hw2: u16 = ((r12 << 12) | (r12 << 8) | rm_lo_bits) as u16;
3027                bytes.extend_from_slice(&hw1.to_le_bytes());
3028                bytes.extend_from_slice(&hw2.to_le_bytes());
3029
3030                // 3. UMULL rd_lo, rd_hi, rn_lo, rm_lo  (rd_lo:rd_hi = a_lo * b_lo)
3031                // Thumb-2 UMULL: hw1=0xFBA0|Rn, hw2=(RdLo<<12)|(RdHi<<8)|Rm
3032                let hw1: u16 = (0xFBA0 | rn_lo_bits) as u16;
3033                let hw2: u16 = ((rd_lo_bits << 12) | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3034                bytes.extend_from_slice(&hw1.to_le_bytes());
3035                bytes.extend_from_slice(&hw2.to_le_bytes());
3036
3037                // 4. ADD rd_hi, R12  (rd_hi += cross products)
3038                // 16-bit high reg ADD: 01000100 D Rm Rdn[2:0]
3039                let d_bit = (rd_hi_bits >> 3) & 1;
3040                let add_instr: u16 =
3041                    (0x4400 | (d_bit << 7) | (r12 << 3) | (rd_hi_bits & 0x7)) as u16;
3042                bytes.extend_from_slice(&add_instr.to_le_bytes());
3043
3044                Ok(bytes)
3045            }
3046
3047            // I64Shl: 64-bit shift left with branch for n<32 vs n>=32
3048            // rm_hi (R3) is used as temp register
3049            ArmOp::I64Shl {
3050                rd_lo,
3051                rd_hi,
3052                rn_lo,
3053                rn_hi,
3054                rm_lo,
3055                rm_hi,
3056            } => {
3057                let rd_lo_bits = reg_to_bits(rd_lo);
3058                let rd_hi_bits = reg_to_bits(rd_hi);
3059                let rn_lo_bits = reg_to_bits(rn_lo);
3060                let rn_hi_bits = reg_to_bits(rn_hi);
3061                let rm_lo_bits = reg_to_bits(rm_lo);
3062                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3063                let mut bytes = Vec::new();
3064
3065                // AND.W rm_lo, rm_lo, #63  (mask shift amount to 6 bits)
3066                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3067                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3068                bytes.extend_from_slice(&hw1.to_le_bytes());
3069                bytes.extend_from_slice(&hw2.to_le_bytes());
3070
3071                // SUBS.W rm_hi, rm_lo, #32  (rm_hi = n-32, sets flags)
3072                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3073                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3074                bytes.extend_from_slice(&hw1.to_le_bytes());
3075                bytes.extend_from_slice(&hw2.to_le_bytes());
3076
3077                // BPL .large (branch if n >= 32, offset = +10 halfwords)
3078                let bpl: u16 = 0xD50A;
3079                bytes.extend_from_slice(&bpl.to_le_bytes());
3080
3081                // --- Small shift (n < 32) ---
3082                // RSB.W rm_hi, rm_lo, #32  (rm_hi = 32-n)
3083                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3084                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3085                bytes.extend_from_slice(&hw1.to_le_bytes());
3086                bytes.extend_from_slice(&hw2.to_le_bytes());
3087
3088                // LSR.W rm_hi, rn_lo, rm_hi  (rm_hi = lo >> (32-n), overflow bits)
3089                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3090                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3091                bytes.extend_from_slice(&hw1.to_le_bytes());
3092                bytes.extend_from_slice(&hw2.to_le_bytes());
3093
3094                // LSL.W rd_hi, rn_hi, rm_lo  (hi <<= n)
3095                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3096                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3097                bytes.extend_from_slice(&hw1.to_le_bytes());
3098                bytes.extend_from_slice(&hw2.to_le_bytes());
3099
3100                // ORR.W rd_hi, rd_hi, rm_hi  (hi |= overflow bits from lo)
3101                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3102                let hw2: u16 = ((rd_hi_bits << 8) | rm_hi_bits) as u16;
3103                bytes.extend_from_slice(&hw1.to_le_bytes());
3104                bytes.extend_from_slice(&hw2.to_le_bytes());
3105
3106                // LSL.W rd_lo, rn_lo, rm_lo  (lo <<= n)
3107                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3108                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3109                bytes.extend_from_slice(&hw1.to_le_bytes());
3110                bytes.extend_from_slice(&hw2.to_le_bytes());
3111
3112                // B .done (skip large shift: +2 halfwords)
3113                let b_done: u16 = 0xE002;
3114                bytes.extend_from_slice(&b_done.to_le_bytes());
3115
3116                // --- Large shift (n >= 32) ---
3117                // LSL.W rd_hi, rn_lo, rm_hi  (hi = lo << (n-32))
3118                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3119                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_hi_bits) as u16;
3120                bytes.extend_from_slice(&hw1.to_le_bytes());
3121                bytes.extend_from_slice(&hw2.to_le_bytes());
3122
3123                // MOV rd_lo, #0
3124                let mov_zero: u16 = 0x2000 | ((rd_lo_bits as u16) << 8);
3125                bytes.extend_from_slice(&mov_zero.to_le_bytes());
3126
3127                Ok(bytes) // Total: 38 bytes
3128            }
3129
3130            // I64ShrU: 64-bit logical shift right with branch for n<32 vs n>=32
3131            ArmOp::I64ShrU {
3132                rd_lo,
3133                rd_hi,
3134                rn_lo,
3135                rn_hi,
3136                rm_lo,
3137                rm_hi,
3138            } => {
3139                let rd_lo_bits = reg_to_bits(rd_lo);
3140                let rd_hi_bits = reg_to_bits(rd_hi);
3141                let rn_lo_bits = reg_to_bits(rn_lo);
3142                let rn_hi_bits = reg_to_bits(rn_hi);
3143                let rm_lo_bits = reg_to_bits(rm_lo);
3144                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3145                let mut bytes = Vec::new();
3146
3147                // AND.W rm_lo, rm_lo, #63
3148                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3149                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3150                bytes.extend_from_slice(&hw1.to_le_bytes());
3151                bytes.extend_from_slice(&hw2.to_le_bytes());
3152
3153                // SUBS.W rm_hi, rm_lo, #32
3154                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3155                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3156                bytes.extend_from_slice(&hw1.to_le_bytes());
3157                bytes.extend_from_slice(&hw2.to_le_bytes());
3158
3159                // BPL .large (+10 halfwords)
3160                let bpl: u16 = 0xD50A;
3161                bytes.extend_from_slice(&bpl.to_le_bytes());
3162
3163                // --- Small shift (n < 32) ---
3164                // RSB.W rm_hi, rm_lo, #32  (rm_hi = 32-n)
3165                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3166                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3167                bytes.extend_from_slice(&hw1.to_le_bytes());
3168                bytes.extend_from_slice(&hw2.to_le_bytes());
3169
3170                // LSL.W rm_hi, rn_hi, rm_hi  (rm_hi = hi << (32-n), bits flowing to lo)
3171                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3172                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3173                bytes.extend_from_slice(&hw1.to_le_bytes());
3174                bytes.extend_from_slice(&hw2.to_le_bytes());
3175
3176                // LSR.W rd_lo, rn_lo, rm_lo  (lo >>= n)
3177                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3178                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3179                bytes.extend_from_slice(&hw1.to_le_bytes());
3180                bytes.extend_from_slice(&hw2.to_le_bytes());
3181
3182                // ORR.W rd_lo, rd_lo, rm_hi  (lo |= overflow from hi)
3183                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3184                let hw2: u16 = ((rd_lo_bits << 8) | rm_hi_bits) as u16;
3185                bytes.extend_from_slice(&hw1.to_le_bytes());
3186                bytes.extend_from_slice(&hw2.to_le_bytes());
3187
3188                // LSR.W rd_hi, rn_hi, rm_lo  (hi >>= n, logical)
3189                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3190                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3191                bytes.extend_from_slice(&hw1.to_le_bytes());
3192                bytes.extend_from_slice(&hw2.to_le_bytes());
3193
3194                // B .done (+2 halfwords)
3195                let b_done: u16 = 0xE002;
3196                bytes.extend_from_slice(&b_done.to_le_bytes());
3197
3198                // --- Large shift (n >= 32) ---
3199                // LSR.W rd_lo, rn_hi, rm_hi  (lo = hi >> (n-32))
3200                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3201                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_hi_bits) as u16;
3202                bytes.extend_from_slice(&hw1.to_le_bytes());
3203                bytes.extend_from_slice(&hw2.to_le_bytes());
3204
3205                // MOV rd_hi, #0
3206                let mov_zero: u16 = 0x2000 | ((rd_hi_bits as u16) << 8);
3207                bytes.extend_from_slice(&mov_zero.to_le_bytes());
3208
3209                Ok(bytes) // Total: 38 bytes
3210            }
3211
3212            // I64ShrS: 64-bit arithmetic shift right with branch for n<32 vs n>=32
3213            ArmOp::I64ShrS {
3214                rd_lo,
3215                rd_hi,
3216                rn_lo,
3217                rn_hi,
3218                rm_lo,
3219                rm_hi,
3220            } => {
3221                let rd_lo_bits = reg_to_bits(rd_lo);
3222                let rd_hi_bits = reg_to_bits(rd_hi);
3223                let rn_lo_bits = reg_to_bits(rn_lo);
3224                let rn_hi_bits = reg_to_bits(rn_hi);
3225                let rm_lo_bits = reg_to_bits(rm_lo);
3226                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3227                let mut bytes = Vec::new();
3228
3229                // AND.W rm_lo, rm_lo, #63
3230                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3231                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3232                bytes.extend_from_slice(&hw1.to_le_bytes());
3233                bytes.extend_from_slice(&hw2.to_le_bytes());
3234
3235                // SUBS.W rm_hi, rm_lo, #32
3236                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3237                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3238                bytes.extend_from_slice(&hw1.to_le_bytes());
3239                bytes.extend_from_slice(&hw2.to_le_bytes());
3240
3241                // BPL .large (+10 halfwords)
3242                let bpl: u16 = 0xD50A;
3243                bytes.extend_from_slice(&bpl.to_le_bytes());
3244
3245                // --- Small shift (n < 32) ---
3246                // RSB.W rm_hi, rm_lo, #32
3247                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3248                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3249                bytes.extend_from_slice(&hw1.to_le_bytes());
3250                bytes.extend_from_slice(&hw2.to_le_bytes());
3251
3252                // LSL.W rm_hi, rn_hi, rm_hi  (rm_hi = hi << (32-n), bits flowing to lo)
3253                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3254                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3255                bytes.extend_from_slice(&hw1.to_le_bytes());
3256                bytes.extend_from_slice(&hw2.to_le_bytes());
3257
3258                // LSR.W rd_lo, rn_lo, rm_lo  (lo >>= n, logical for lo word)
3259                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3260                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3261                bytes.extend_from_slice(&hw1.to_le_bytes());
3262                bytes.extend_from_slice(&hw2.to_le_bytes());
3263
3264                // ORR.W rd_lo, rd_lo, rm_hi  (lo |= overflow from hi)
3265                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3266                let hw2: u16 = ((rd_lo_bits << 8) | rm_hi_bits) as u16;
3267                bytes.extend_from_slice(&hw1.to_le_bytes());
3268                bytes.extend_from_slice(&hw2.to_le_bytes());
3269
3270                // ASR.W rd_hi, rn_hi, rm_lo  (hi >>= n, arithmetic/sign-extending)
3271                let hw1: u16 = (0xFA40 | rn_hi_bits) as u16;
3272                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3273                bytes.extend_from_slice(&hw1.to_le_bytes());
3274                bytes.extend_from_slice(&hw2.to_le_bytes());
3275
3276                // B .done (+3 halfwords, large shift is 8 bytes)
3277                let b_done: u16 = 0xE003;
3278                bytes.extend_from_slice(&b_done.to_le_bytes());
3279
3280                // --- Large shift (n >= 32) ---
3281                // ASR.W rd_lo, rn_hi, rm_hi  (lo = hi >>> (n-32))
3282                let hw1: u16 = (0xFA40 | rn_hi_bits) as u16;
3283                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_hi_bits) as u16;
3284                bytes.extend_from_slice(&hw1.to_le_bytes());
3285                bytes.extend_from_slice(&hw2.to_le_bytes());
3286
3287                // ASR.W rd_hi, rn_hi, #31  (hi = sign extension, all 0s or all 1s)
3288                // Thumb-2 ASR immediate: hw1=0xEA4F, hw2=imm3:Rd:imm2:10:Rm
3289                // imm5=31=11111 → imm3=111, imm2=11
3290                let hw1: u16 = 0xEA4F;
3291                let hw2: u16 = (0x7000 | (rd_hi_bits << 8) | 0x00E0 | rn_hi_bits) as u16;
3292                bytes.extend_from_slice(&hw1.to_le_bytes());
3293                bytes.extend_from_slice(&hw2.to_le_bytes());
3294
3295                Ok(bytes) // Total: 40 bytes
3296            }
3297
3298            // I64Rotl: 64-bit rotate left
3299            // For n < 32: new_hi = (hi << n) | (lo >> (32-n)), new_lo = (lo << n) | (hi >> (32-n))
3300            // For n >= 32: same formula but with lo/hi conceptually swapped, shift by (n-32)
3301            // Uses R4 (saved/restored) plus R3 and R12 (clobbered, not saved) as scratch; `shift` is overwritten
3302            ArmOp::I64Rotl {
3303                rdlo,
3304                rdhi,
3305                rnlo,
3306                rnhi,
3307                shift,
3308            } => {
3309                let rd_lo_bits = reg_to_bits(rdlo);
3310                let rd_hi_bits = reg_to_bits(rdhi);
3311                let rn_lo_bits = reg_to_bits(rnlo);
3312                let rn_hi_bits = reg_to_bits(rnhi);
3313                let shift_bits = reg_to_bits(shift);
3314                let r12: u32 = 12; // IP scratch
3315                let r3: u32 = 3; // Scratch (high word of shift amount, unused)
3316                let r4: u32 = 4; // Scratch (saved/restored)
3317                let mut bytes = Vec::new();
3318
3319                // PUSH {R4}
3320                bytes.extend_from_slice(&0xB410u16.to_le_bytes());
3321
3322                // AND.W shift, shift, #63 (mask to 6 bits)
3323                let hw1: u16 = (0xF000 | shift_bits) as u16;
3324                let hw2: u16 = ((shift_bits << 8) | 0x3F) as u16;
3325                bytes.extend_from_slice(&hw1.to_le_bytes());
3326                bytes.extend_from_slice(&hw2.to_le_bytes());
3327
3328                // SUBS.W R3, shift, #32 (R3 = n-32, sets flags)
3329                let hw1: u16 = (0xF1B0 | shift_bits) as u16;
3330                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3331                bytes.extend_from_slice(&hw1.to_le_bytes());
3332                bytes.extend_from_slice(&hw2.to_le_bytes());
3333
3334                // BPL .large (branch if n >= 32, offset = +14 halfwords)
3335                let bpl: u16 = 0xD50E;
3336                bytes.extend_from_slice(&bpl.to_le_bytes());
3337
3338                // === Small rotation (n < 32) ===
3339                // RSB.W R3, shift, #32 (R3 = 32-n)
3340                let hw1: u16 = (0xF1C0 | shift_bits) as u16;
3341                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3342                bytes.extend_from_slice(&hw1.to_le_bytes());
3343                bytes.extend_from_slice(&hw2.to_le_bytes());
3344
3345                // LSR.W R4, rn_lo, R3 (R4 = lo >> (32-n), will go to new_hi)
3346                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3347                let hw2: u16 = (0xF000 | (r4 << 8) | r3) as u16;
3348                bytes.extend_from_slice(&hw1.to_le_bytes());
3349                bytes.extend_from_slice(&hw2.to_le_bytes());
3350
3351                // LSR.W R12, rn_hi, R3 (R12 = hi >> (32-n), will go to new_lo)
3352                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3353                let hw2: u16 = (0xF000 | (r12 << 8) | r3) as u16;
3354                bytes.extend_from_slice(&hw1.to_le_bytes());
3355                bytes.extend_from_slice(&hw2.to_le_bytes());
3356
3357                // LSL.W rd_hi, rn_hi, shift (rd_hi = hi << n)
3358                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3359                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | shift_bits) as u16;
3360                bytes.extend_from_slice(&hw1.to_le_bytes());
3361                bytes.extend_from_slice(&hw2.to_le_bytes());
3362
3363                // ORR.W rd_hi, rd_hi, R4 (rd_hi = (hi << n) | (lo >> (32-n)))
3364                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3365                let hw2: u16 = ((rd_hi_bits << 8) | r4) as u16;
3366                bytes.extend_from_slice(&hw1.to_le_bytes());
3367                bytes.extend_from_slice(&hw2.to_le_bytes());
3368
3369                // LSL.W rd_lo, rn_lo, shift (rd_lo = lo << n)
3370                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3371                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | shift_bits) as u16;
3372                bytes.extend_from_slice(&hw1.to_le_bytes());
3373                bytes.extend_from_slice(&hw2.to_le_bytes());
3374
3375                // ORR.W rd_lo, rd_lo, R12 (rd_lo = (lo << n) | (hi >> (32-n)))
3376                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3377                let hw2: u16 = ((rd_lo_bits << 8) | r12) as u16;
3378                bytes.extend_from_slice(&hw1.to_le_bytes());
3379                bytes.extend_from_slice(&hw2.to_le_bytes());
3380
3381                // B .done (skip large block, offset = +14 halfwords)
3382                let b_done: u16 = 0xE00E;
3383                bytes.extend_from_slice(&b_done.to_le_bytes());
3384
3385                // === Large rotation (n >= 32) ===
3386                // R3 already has n-32 from the SUBS
3387                // RSB.W R4, R3, #32 (R4 = 32-(n-32) = 64-n)
3388                let hw1: u16 = (0xF1C0 | r3) as u16;
3389                let hw2: u16 = ((r4 << 8) | 0x20) as u16;
3390                bytes.extend_from_slice(&hw1.to_le_bytes());
3391                bytes.extend_from_slice(&hw2.to_le_bytes());
3392
3393                // LSR.W R12, rn_hi, R4 (R12 = hi >> (64-n), goes to new_hi low bits)
3394                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3395                let hw2: u16 = (0xF000 | (r12 << 8) | r4) as u16;
3396                bytes.extend_from_slice(&hw1.to_le_bytes());
3397                bytes.extend_from_slice(&hw2.to_le_bytes());
3398
3399                // LSR.W R4, rn_lo, R4 (R4 = lo >> (64-n), goes to new_lo low bits)
3400                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3401                let hw2: u16 = (0xF000 | (r4 << 8) | r4) as u16;
3402                bytes.extend_from_slice(&hw1.to_le_bytes());
3403                bytes.extend_from_slice(&hw2.to_le_bytes());
3404
3405                // LSL.W shift, rn_lo, R3 (shift = lo << (n-32), new_hi high bits)
3406                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3407                let hw2: u16 = (0xF000 | (shift_bits << 8) | r3) as u16;
3408                bytes.extend_from_slice(&hw1.to_le_bytes());
3409                bytes.extend_from_slice(&hw2.to_le_bytes());
3410
3411                // ORR.W shift, shift, R12 (shift = (lo << (n-32)) | (hi >> (64-n)) = new_hi)
3412                let hw1: u16 = (0xEA40 | shift_bits) as u16;
3413                let hw2: u16 = ((shift_bits << 8) | r12) as u16;
3414                bytes.extend_from_slice(&hw1.to_le_bytes());
3415                bytes.extend_from_slice(&hw2.to_le_bytes());
3416
3417                // LSL.W rd_lo, rn_hi, R3 (rd_lo = hi << (n-32), new_lo high bits)
3418                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3419                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | r3) as u16;
3420                bytes.extend_from_slice(&hw1.to_le_bytes());
3421                bytes.extend_from_slice(&hw2.to_le_bytes());
3422
3423                // ORR.W rd_lo, rd_lo, R4 (rd_lo = (hi << (n-32)) | (lo >> (64-n)) = new_lo)
3424                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3425                let hw2: u16 = ((rd_lo_bits << 8) | r4) as u16;
3426                bytes.extend_from_slice(&hw1.to_le_bytes());
3427                bytes.extend_from_slice(&hw2.to_le_bytes());
3428
3429                // MOV rd_hi, shift (rd_hi = new_hi)
3430                let d_bit = (rd_hi_bits >> 3) & 1;
3431                let mov_instr: u16 =
3432                    (0x4600 | (d_bit << 7) | (shift_bits << 3) | (rd_hi_bits & 0x7)) as u16;
3433                bytes.extend_from_slice(&mov_instr.to_le_bytes());
3434
3435                // POP {R4}
3436                bytes.extend_from_slice(&0xBC10u16.to_le_bytes());
3437
3438                Ok(bytes) // Total: 74 bytes
3439            }
3440
3441            // I64Rotr: 64-bit rotate right
3442            // rotr(x, n) = rotl(x, 64-n)
3443            // For n < 32: new_lo = (lo >> n) | (hi << (32-n)), new_hi = (hi >> n) | (lo << (32-n))
3444            // For n >= 32: same formula but with lo/hi swapped, shift by (n-32)
3445            ArmOp::I64Rotr {
3446                rdlo,
3447                rdhi,
3448                rnlo,
3449                rnhi,
3450                shift,
3451            } => {
3452                let rd_lo_bits = reg_to_bits(rdlo);
3453                let rd_hi_bits = reg_to_bits(rdhi);
3454                let rn_lo_bits = reg_to_bits(rnlo);
3455                let rn_hi_bits = reg_to_bits(rnhi);
3456                let shift_bits = reg_to_bits(shift);
3457                let r12: u32 = 12;
3458                let r3: u32 = 3;
3459                let r4: u32 = 4;
3460                let mut bytes = Vec::new();
3461
3462                // PUSH {R4}
3463                bytes.extend_from_slice(&0xB410u16.to_le_bytes());
3464
3465                // AND.W shift, shift, #63
3466                let hw1: u16 = (0xF000 | shift_bits) as u16;
3467                let hw2: u16 = ((shift_bits << 8) | 0x3F) as u16;
3468                bytes.extend_from_slice(&hw1.to_le_bytes());
3469                bytes.extend_from_slice(&hw2.to_le_bytes());
3470
3471                // SUBS.W R3, shift, #32
3472                let hw1: u16 = (0xF1B0 | shift_bits) as u16;
3473                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3474                bytes.extend_from_slice(&hw1.to_le_bytes());
3475                bytes.extend_from_slice(&hw2.to_le_bytes());
3476
3477                // BPL .large (+14 halfwords)
3478                let bpl: u16 = 0xD50E;
3479                bytes.extend_from_slice(&bpl.to_le_bytes());
3480
3481                // === Small rotation (n < 32) ===
3482                // RSB.W R3, shift, #32 (R3 = 32-n)
3483                let hw1: u16 = (0xF1C0 | shift_bits) as u16;
3484                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3485                bytes.extend_from_slice(&hw1.to_le_bytes());
3486                bytes.extend_from_slice(&hw2.to_le_bytes());
3487
3488                // LSL.W R4, rn_hi, R3 (R4 = hi << (32-n), will go to new_lo)
3489                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3490                let hw2: u16 = (0xF000 | (r4 << 8) | r3) as u16;
3491                bytes.extend_from_slice(&hw1.to_le_bytes());
3492                bytes.extend_from_slice(&hw2.to_le_bytes());
3493
3494                // LSL.W R12, rn_lo, R3 (R12 = lo << (32-n), will go to new_hi)
3495                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3496                let hw2: u16 = (0xF000 | (r12 << 8) | r3) as u16;
3497                bytes.extend_from_slice(&hw1.to_le_bytes());
3498                bytes.extend_from_slice(&hw2.to_le_bytes());
3499
3500                // LSR.W rd_lo, rn_lo, shift (rd_lo = lo >> n)
3501                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3502                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | shift_bits) as u16;
3503                bytes.extend_from_slice(&hw1.to_le_bytes());
3504                bytes.extend_from_slice(&hw2.to_le_bytes());
3505
3506                // ORR.W rd_lo, rd_lo, R4 (rd_lo = (lo >> n) | (hi << (32-n)))
3507                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3508                let hw2: u16 = ((rd_lo_bits << 8) | r4) as u16;
3509                bytes.extend_from_slice(&hw1.to_le_bytes());
3510                bytes.extend_from_slice(&hw2.to_le_bytes());
3511
3512                // LSR.W rd_hi, rn_hi, shift (rd_hi = hi >> n)
3513                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3514                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | shift_bits) as u16;
3515                bytes.extend_from_slice(&hw1.to_le_bytes());
3516                bytes.extend_from_slice(&hw2.to_le_bytes());
3517
3518                // ORR.W rd_hi, rd_hi, R12 (rd_hi = (hi >> n) | (lo << (32-n)))
3519                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3520                let hw2: u16 = ((rd_hi_bits << 8) | r12) as u16;
3521                bytes.extend_from_slice(&hw1.to_le_bytes());
3522                bytes.extend_from_slice(&hw2.to_le_bytes());
3523
3524                // B .done (+14 halfwords)
3525                let b_done: u16 = 0xE00E;
3526                bytes.extend_from_slice(&b_done.to_le_bytes());
3527
3528                // === Large rotation (n >= 32) ===
3529                // RSB.W R4, R3, #32 (R4 = 64-n)
3530                let hw1: u16 = (0xF1C0 | r3) as u16;
3531                let hw2: u16 = ((r4 << 8) | 0x20) as u16;
3532                bytes.extend_from_slice(&hw1.to_le_bytes());
3533                bytes.extend_from_slice(&hw2.to_le_bytes());
3534
3535                // LSL.W R12, rn_lo, R4 (R12 = lo << (64-n), goes to new_lo high bits)
3536                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3537                let hw2: u16 = (0xF000 | (r12 << 8) | r4) as u16;
3538                bytes.extend_from_slice(&hw1.to_le_bytes());
3539                bytes.extend_from_slice(&hw2.to_le_bytes());
3540
3541                // LSL.W R4, rn_hi, R4 (R4 = hi << (64-n), goes to new_hi high bits)
3542                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3543                let hw2: u16 = (0xF000 | (r4 << 8) | r4) as u16;
3544                bytes.extend_from_slice(&hw1.to_le_bytes());
3545                bytes.extend_from_slice(&hw2.to_le_bytes());
3546
3547                // LSR.W shift, rn_hi, R3 (shift = hi >> (n-32), new_lo low bits)
3548                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3549                let hw2: u16 = (0xF000 | (shift_bits << 8) | r3) as u16;
3550                bytes.extend_from_slice(&hw1.to_le_bytes());
3551                bytes.extend_from_slice(&hw2.to_le_bytes());
3552
3553                // ORR.W shift, shift, R12 (shift = (hi >> (n-32)) | (lo << (64-n)) = new_lo)
3554                let hw1: u16 = (0xEA40 | shift_bits) as u16;
3555                let hw2: u16 = ((shift_bits << 8) | r12) as u16;
3556                bytes.extend_from_slice(&hw1.to_le_bytes());
3557                bytes.extend_from_slice(&hw2.to_le_bytes());
3558
3559                // LSR.W rd_hi, rn_lo, R3 (rd_hi = lo >> (n-32), new_hi low bits)
3560                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3561                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | r3) as u16;
3562                bytes.extend_from_slice(&hw1.to_le_bytes());
3563                bytes.extend_from_slice(&hw2.to_le_bytes());
3564
3565                // ORR.W rd_hi, rd_hi, R4 (rd_hi = (lo >> (n-32)) | (hi << (64-n)) = new_hi)
3566                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3567                let hw2: u16 = ((rd_hi_bits << 8) | r4) as u16;
3568                bytes.extend_from_slice(&hw1.to_le_bytes());
3569                bytes.extend_from_slice(&hw2.to_le_bytes());
3570
3571                // MOV rd_lo, shift (rd_lo = new_lo)
3572                let d_bit = (rd_lo_bits >> 3) & 1;
3573                let mov_instr: u16 =
3574                    (0x4600 | (d_bit << 7) | (shift_bits << 3) | (rd_lo_bits & 0x7)) as u16;
3575                bytes.extend_from_slice(&mov_instr.to_le_bytes());
3576
3577                // POP {R4}
3578                bytes.extend_from_slice(&0xBC10u16.to_le_bytes());
3579
3580                Ok(bytes) // Total: 74 bytes
3581            }
3582
3583            // I64Clz: Count leading zeros in 64-bit value
3584            // If hi != 0: result = CLZ(hi)
3585            // If hi == 0: result = 32 + CLZ(lo)
3586            //
3587            // Layout (CMP + BEQ: branch over the hi != 0 path when the high word is zero):
3588            // 0: CMP.W rnhi, #0 (4 bytes)
3589            // 4: BEQ .hi_zero (2 bytes) - branch forward to offset 14
3590            // 6: CLZ.W rd, rnhi (4 bytes)
3591            // 10: B .done (2 bytes) - branch forward to offset 22
3592            // 12: NOP (2 bytes) - padding for alignment
3593            // 14: .hi_zero: CLZ.W rd, rnlo (4 bytes)
3594            // 18: ADD.W rd, rd, #32 (4 bytes)
3595            // 22: .done: MOVS rnhi, #0 (2 bytes) - clear high word of the i64 result; total 24 bytes
3596            ArmOp::I64Clz { rd, rnlo, rnhi } => {
3597                let rd_bits = reg_to_bits(rd);
3598                let rn_lo_bits = reg_to_bits(rnlo);
3599                let rn_hi_bits = reg_to_bits(rnhi);
3600                let mut bytes = Vec::new();
3601
3602                // CMP.W rnhi, #0 (4 bytes at offset 0)
3603                let hw1: u16 = (0xF1B0 | rn_hi_bits) as u16;
3604                let hw2: u16 = 0x0F00;
3605                bytes.extend_from_slice(&hw1.to_le_bytes());
3606                bytes.extend_from_slice(&hw2.to_le_bytes());
3607
3608                // BEQ .hi_zero (2 bytes at offset 4)
3609                // PC = 4 + 4 = 8, target = 14, offset = 6, imm8 = 3
3610                let beq: u16 = 0xD003;
3611                bytes.extend_from_slice(&beq.to_le_bytes());
3612
3613                // CLZ.W rd, rnhi (4 bytes at offset 6)
3614                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3615                let hw1: u16 = (0xFAB0 | rn_hi_bits) as u16;
3616                let hw2: u16 = (0xF080 | (rd_bits << 8) | rn_hi_bits) as u16;
3617                bytes.extend_from_slice(&hw1.to_le_bytes());
3618                bytes.extend_from_slice(&hw2.to_le_bytes());
3619
3620                // B .done (2 bytes at offset 10)
3621                // PC = 10 + 4 = 14, target = 22, offset = 8, imm11 = 4
3622                let b_done: u16 = 0xE004;
3623                bytes.extend_from_slice(&b_done.to_le_bytes());
3624
3625                // NOP (2 bytes at offset 12) - padding
3626                bytes.extend_from_slice(&0xBF00u16.to_le_bytes());
3627
3628                // .hi_zero: (offset 14)
3629                // CLZ.W rd, rnlo (4 bytes)
3630                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3631                let hw1: u16 = (0xFAB0 | rn_lo_bits) as u16;
3632                let hw2: u16 = (0xF080 | (rd_bits << 8) | rn_lo_bits) as u16;
3633                bytes.extend_from_slice(&hw1.to_le_bytes());
3634                bytes.extend_from_slice(&hw2.to_le_bytes());
3635
3636                // ADD.W rd, rd, #32 (4 bytes at offset 18)
3637                let hw1: u16 = (0xF100 | rd_bits) as u16;
3638                let hw2: u16 = ((rd_bits << 8) | 0x20) as u16;
3639                bytes.extend_from_slice(&hw1.to_le_bytes());
3640                bytes.extend_from_slice(&hw2.to_le_bytes());
3641
3642                // .done: (offset 22)
3643                // i64.clz returns i64, so clear high word: MOVS rnhi, #0 (2 bytes, updates flags)
3644                // MOVS Rn, #0: 0010 0 Rn 00000000 (3-bit Rn; NOTE(review): assumes rnhi is R0-R7 and rnhi != rd)
3645                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3646                bytes.extend_from_slice(&mov0.to_le_bytes());
3647
3648                Ok(bytes)
3649            }
3650
3651            // I64Ctz: Count trailing zeros in 64-bit value
3652            // If lo != 0: result = CTZ(lo) = CLZ(RBIT(lo))
3653            // If lo == 0: result = 32 + CTZ(hi) = 32 + CLZ(RBIT(hi))
3654            //
3655            // Layout:
3656            // 0: CMP.W rnlo, #0 (4 bytes)
3657            // 4: BEQ .lo_zero (2 bytes) - branch to offset 18
3658            // 6: RBIT.W rd, rnlo (4 bytes)
3659            // 10: CLZ.W rd, rd (4 bytes)
3660            // 14: B .done (2 bytes) - branch to offset 30
3661            // 16: NOP (2 bytes) - padding
3662            // 18: .lo_zero: RBIT.W rd, rnhi (4 bytes)
3663            // 22: CLZ.W rd, rd (4 bytes)
3664            // 26: ADD.W rd, rd, #32 (4 bytes)
3665            // 30: .done: MOVS rnhi, #0 (2 bytes) - clear high word of the i64 result; total 32 bytes
3666            ArmOp::I64Ctz { rd, rnlo, rnhi } => {
3667                let rd_bits = reg_to_bits(rd);
3668                let rn_lo_bits = reg_to_bits(rnlo);
3669                let rn_hi_bits = reg_to_bits(rnhi);
3670                let mut bytes = Vec::new();
3671
3672                // CMP.W rnlo, #0 (4 bytes at offset 0)
3673                let hw1: u16 = (0xF1B0 | rn_lo_bits) as u16;
3674                let hw2: u16 = 0x0F00;
3675                bytes.extend_from_slice(&hw1.to_le_bytes());
3676                bytes.extend_from_slice(&hw2.to_le_bytes());
3677
3678                // BEQ .lo_zero (2 bytes at offset 4)
3679                // PC = 4 + 4 = 8, target = 18, offset = 10, imm8 = 5
3680                let beq: u16 = 0xD005;
3681                bytes.extend_from_slice(&beq.to_le_bytes());
3682
3683                // RBIT.W rd, rnlo (4 bytes at offset 6)
3684                // RBIT T1: hw1 = 0xFA9<Rm>, hw2 = 0xF<Rd>A<Rm>
3685                let hw1: u16 = (0xFA90 | rn_lo_bits) as u16;
3686                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rn_lo_bits) as u16;
3687                bytes.extend_from_slice(&hw1.to_le_bytes());
3688                bytes.extend_from_slice(&hw2.to_le_bytes());
3689
3690                // CLZ.W rd, rd (4 bytes at offset 10)
3691                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3692                let hw1: u16 = (0xFAB0 | rd_bits) as u16;
3693                let hw2: u16 = (0xF080 | (rd_bits << 8) | rd_bits) as u16;
3694                bytes.extend_from_slice(&hw1.to_le_bytes());
3695                bytes.extend_from_slice(&hw2.to_le_bytes());
3696
3697                // B .done (2 bytes at offset 14)
3698                // PC = 14 + 4 = 18, target = 30, offset = 12, imm11 = 6
3699                let b_done: u16 = 0xE006;
3700                bytes.extend_from_slice(&b_done.to_le_bytes());
3701
3702                // NOP (2 bytes at offset 16) - padding
3703                bytes.extend_from_slice(&0xBF00u16.to_le_bytes());
3704
3705                // .lo_zero: (offset 18)
3706                // RBIT.W rd, rnhi (4 bytes)
3707                // RBIT T1: hw1 = 0xFA9<Rm>, hw2 = 0xF<Rd>A<Rm>
3708                let hw1: u16 = (0xFA90 | rn_hi_bits) as u16;
3709                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rn_hi_bits) as u16;
3710                bytes.extend_from_slice(&hw1.to_le_bytes());
3711                bytes.extend_from_slice(&hw2.to_le_bytes());
3712
3713                // CLZ.W rd, rd (4 bytes at offset 22)
3714                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3715                let hw1: u16 = (0xFAB0 | rd_bits) as u16;
3716                let hw2: u16 = (0xF080 | (rd_bits << 8) | rd_bits) as u16;
3717                bytes.extend_from_slice(&hw1.to_le_bytes());
3718                bytes.extend_from_slice(&hw2.to_le_bytes());
3719
3720                // ADD.W rd, rd, #32 (4 bytes at offset 26)
3721                let hw1: u16 = (0xF100 | rd_bits) as u16;
3722                let hw2: u16 = ((rd_bits << 8) | 0x20) as u16;
3723                bytes.extend_from_slice(&hw1.to_le_bytes());
3724                bytes.extend_from_slice(&hw2.to_le_bytes());
3725
3726                // .done: (offset 30)
3727                // i64.ctz returns i64, so clear high word: MOVS rnhi, #0 (2 bytes; NOTE(review): 3-bit Rd field, assumes rnhi is R0-R7 and rnhi != rd)
3728                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3729                bytes.extend_from_slice(&mov0.to_le_bytes());
3730
3731                Ok(bytes)
3732            }
3733
3734            // I64Popcnt: Population count of 64-bit value
3735            // result = POPCNT(lo) + POPCNT(hi)
3736            // Using SIMD-style parallel bit counting algorithm
3737            ArmOp::I64Popcnt { rd, rnlo, rnhi } => {
3738                let rd_bits = reg_to_bits(rd);
3739                let rn_lo_bits = reg_to_bits(rnlo);
3740                let rn_hi_bits = reg_to_bits(rnhi);
3741                let r12: u32 = 12; // IP scratch
3742                let r3: u32 = 3; // Scratch for hi popcnt result
3743                let mut bytes = Vec::new();
3744
3745                // PUSH {R3, R4, R5} - save scratch registers
3746                bytes.extend_from_slice(&0xB438u16.to_le_bytes());
3747
3748                // Strategy: compute popcnt(lo) -> R4, popcnt(hi) -> R5, add them -> rd
3749                // Using lookup table approach for each byte would be too large
3750                // Using shift-and-add approach instead
3751
3752                // For simplicity and correctness, use the efficient parallel algorithm
3753                // but implement it as a series of inline operations
3754
3755                // MOV R4, rnlo
3756                let d_bit: u32 = 0; // R4 < 8, so high bit is 0
3757                let mov: u16 = (0x4600 | (d_bit << 7) | (rn_lo_bits << 3) | (4 & 0x7)) as u16;
3758                bytes.extend_from_slice(&mov.to_le_bytes());
3759
3760                // MOV R5, rnhi
3761                let d_bit: u32 = 0; // R5 < 8, so high bit is 0
3762                let mov: u16 = (0x4600 | (d_bit << 7) | (rn_hi_bits << 3) | (5 & 0x7)) as u16;
3763                bytes.extend_from_slice(&mov.to_le_bytes());
3764
3765                // --- POPCNT for R4 (lo word) ---
3766                // Step 1: x = x - ((x >> 1) & 0x55555555)
3767                // LSR.W R12, R4, #1
3768                let hw1: u16 = 0xEA4F;
3769                let hw2: u16 = ((r12 << 8) | 0x50 | 4) as u16;
3770                bytes.extend_from_slice(&hw1.to_le_bytes());
3771                bytes.extend_from_slice(&hw2.to_le_bytes());
3772
3773                // Load 0x55555555 into R3 using MOVW/MOVT
3774                // MOVW R3, #0x5555
3775                bytes.extend_from_slice(&0xF245u16.to_le_bytes());
3776                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3777                // MOVT R3, #0x5555
3778                bytes.extend_from_slice(&0xF2C5u16.to_le_bytes());
3779                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3780
3781                // AND.W R12, R12, R3
3782                let hw1: u16 = (0xEA00 | r12) as u16;
3783                let hw2: u16 = ((r12 << 8) | r3) as u16;
3784                bytes.extend_from_slice(&hw1.to_le_bytes());
3785                bytes.extend_from_slice(&hw2.to_le_bytes());
3786
3787                // SUB.W R4, R4, R12
3788                let hw1: u16 = (0xEBA0 | 4) as u16;
3789                let hw2: u16 = ((4 << 8) | r12) as u16;
3790                bytes.extend_from_slice(&hw1.to_le_bytes());
3791                bytes.extend_from_slice(&hw2.to_le_bytes());
3792
3793                // Step 2: x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
3794                // Load 0x33333333 into R3
3795                // MOVW R3, #0x3333
3796                bytes.extend_from_slice(&0xF243u16.to_le_bytes());
3797                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3798                // MOVT R3, #0x3333
3799                bytes.extend_from_slice(&0xF2C3u16.to_le_bytes());
3800                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3801
3802                // AND.W R12, R4, R3
3803                let hw1: u16 = (0xEA00 | 4) as u16;
3804                let hw2: u16 = ((r12 << 8) | r3) as u16;
3805                bytes.extend_from_slice(&hw1.to_le_bytes());
3806                bytes.extend_from_slice(&hw2.to_le_bytes());
3807
3808                // LSR.W R4, R4, #2
3809                let hw1: u16 = 0xEA4F;
3810                let hw2: u16 = ((4 << 8) | 0x90 | 4) as u16;
3811                bytes.extend_from_slice(&hw1.to_le_bytes());
3812                bytes.extend_from_slice(&hw2.to_le_bytes());
3813
3814                // AND.W R4, R4, R3
3815                let hw1: u16 = (0xEA00 | 4) as u16;
3816                let hw2: u16 = ((4 << 8) | r3) as u16;
3817                bytes.extend_from_slice(&hw1.to_le_bytes());
3818                bytes.extend_from_slice(&hw2.to_le_bytes());
3819
3820                // ADD.W R4, R4, R12
3821                let hw1: u16 = (0xEB00 | 4) as u16;
3822                let hw2: u16 = ((4 << 8) | r12) as u16;
3823                bytes.extend_from_slice(&hw1.to_le_bytes());
3824                bytes.extend_from_slice(&hw2.to_le_bytes());
3825
3826                // Step 3: x = (x + (x >> 4)) & 0x0F0F0F0F
3827                // LSR.W R12, R4, #4
3828                // hw2 = (imm3 << 12) | (Rd << 8) | (imm2 << 6) | (type << 4) | Rm
3829                // imm5=4=00100 → imm3=1, imm2=0, type=01(LSR)
3830                let hw1: u16 = 0xEA4F;
3831                let hw2: u16 = (0x1000 | (r12 << 8) | 0x10 | 4) as u16;
3832                bytes.extend_from_slice(&hw1.to_le_bytes());
3833                bytes.extend_from_slice(&hw2.to_le_bytes());
3834
3835                // ADD.W R4, R4, R12
3836                let hw1: u16 = (0xEB00 | 4) as u16;
3837                let hw2: u16 = ((4 << 8) | r12) as u16;
3838                bytes.extend_from_slice(&hw1.to_le_bytes());
3839                bytes.extend_from_slice(&hw2.to_le_bytes());
3840
3841                // Load 0x0F0F0F0F into R3
3842                // MOVW R3, #0x0F0F (imm4=0, i=1, imm3=7, imm8=0x0F)
3843                // hw1 = 11110 1 10 0100 0000 = 0xF640
3844                // hw2 = 0 111 0011 00001111 = 0x730F
3845                bytes.extend_from_slice(&0xF640u16.to_le_bytes());
3846                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3847                // MOVT R3, #0x0F0F
3848                bytes.extend_from_slice(&0xF6C0u16.to_le_bytes());
3849                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3850
3851                // AND.W R4, R4, R3
3852                let hw1: u16 = (0xEA00 | 4) as u16;
3853                let hw2: u16 = ((4 << 8) | r3) as u16;
3854                bytes.extend_from_slice(&hw1.to_le_bytes());
3855                bytes.extend_from_slice(&hw2.to_le_bytes());
3856
3857                // Step 4: x = x * 0x01010101 >> 24
3858                // Load 0x01010101 into R3
3859                // MOVW R3, #0x0101
3860                bytes.extend_from_slice(&0xF240u16.to_le_bytes());
3861                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3862                // MOVT R3, #0x0101
3863                bytes.extend_from_slice(&0xF2C0u16.to_le_bytes());
3864                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3865
3866                // MUL R4, R4, R3
3867                // MUL T2: hw1 = 0xFB00|Rn, hw2 = 0xF000|(Rd<<8)|Rm
3868                let hw1: u16 = (0xFB00 | 4) as u16;
3869                let hw2: u16 = (0xF000 | (4 << 8) | r3) as u16;
3870                bytes.extend_from_slice(&hw1.to_le_bytes());
3871                bytes.extend_from_slice(&hw2.to_le_bytes());
3872
3873                // LSR.W R4, R4, #24
3874                // imm5=24=11000 → imm3=6, imm2=0, type=01(LSR)
3875                let hw1: u16 = 0xEA4F;
3876                let hw2: u16 = (0x6000 | (4 << 8) | 0x10 | 4) as u16;
3877                bytes.extend_from_slice(&hw1.to_le_bytes());
3878                bytes.extend_from_slice(&hw2.to_le_bytes());
3879
3880                // --- POPCNT for R5 (hi word) - same algorithm ---
3881                // Step 1
3882                let hw1: u16 = 0xEA4F;
3883                let hw2: u16 = ((r12 << 8) | 0x50 | 5) as u16;
3884                bytes.extend_from_slice(&hw1.to_le_bytes());
3885                bytes.extend_from_slice(&hw2.to_le_bytes());
3886
3887                // Load 0x55555555 into R3
3888                bytes.extend_from_slice(&0xF245u16.to_le_bytes());
3889                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3890                bytes.extend_from_slice(&0xF2C5u16.to_le_bytes());
3891                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3892
3893                let hw1: u16 = (0xEA00 | r12) as u16;
3894                let hw2: u16 = ((r12 << 8) | r3) as u16;
3895                bytes.extend_from_slice(&hw1.to_le_bytes());
3896                bytes.extend_from_slice(&hw2.to_le_bytes());
3897
3898                let hw1: u16 = (0xEBA0 | 5) as u16;
3899                let hw2: u16 = ((5 << 8) | r12) as u16;
3900                bytes.extend_from_slice(&hw1.to_le_bytes());
3901                bytes.extend_from_slice(&hw2.to_le_bytes());
3902
3903                // Step 2
3904                bytes.extend_from_slice(&0xF243u16.to_le_bytes());
3905                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3906                bytes.extend_from_slice(&0xF2C3u16.to_le_bytes());
3907                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
3908
3909                let hw1: u16 = (0xEA00 | 5) as u16;
3910                let hw2: u16 = ((r12 << 8) | r3) as u16;
3911                bytes.extend_from_slice(&hw1.to_le_bytes());
3912                bytes.extend_from_slice(&hw2.to_le_bytes());
3913
3914                let hw1: u16 = 0xEA4F;
3915                let hw2: u16 = ((5 << 8) | 0x90 | 5) as u16;
3916                bytes.extend_from_slice(&hw1.to_le_bytes());
3917                bytes.extend_from_slice(&hw2.to_le_bytes());
3918
3919                let hw1: u16 = (0xEA00 | 5) as u16;
3920                let hw2: u16 = ((5 << 8) | r3) as u16;
3921                bytes.extend_from_slice(&hw1.to_le_bytes());
3922                bytes.extend_from_slice(&hw2.to_le_bytes());
3923
3924                let hw1: u16 = (0xEB00 | 5) as u16;
3925                let hw2: u16 = ((5 << 8) | r12) as u16;
3926                bytes.extend_from_slice(&hw1.to_le_bytes());
3927                bytes.extend_from_slice(&hw2.to_le_bytes());
3928
3929                // Step 3: LSR.W R12, R5, #4
3930                // imm5=4=00100 → imm3=1, imm2=0, type=01(LSR)
3931                let hw1: u16 = 0xEA4F;
3932                let hw2: u16 = (0x1000 | (r12 << 8) | 0x10 | 5) as u16;
3933                bytes.extend_from_slice(&hw1.to_le_bytes());
3934                bytes.extend_from_slice(&hw2.to_le_bytes());
3935
3936                let hw1: u16 = (0xEB00 | 5) as u16;
3937                let hw2: u16 = ((5 << 8) | r12) as u16;
3938                bytes.extend_from_slice(&hw1.to_le_bytes());
3939                bytes.extend_from_slice(&hw2.to_le_bytes());
3940
3941                // Load 0x0F0F0F0F into R3 (for hi-word)
3942                bytes.extend_from_slice(&0xF640u16.to_le_bytes());
3943                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3944                bytes.extend_from_slice(&0xF6C0u16.to_le_bytes());
3945                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
3946
3947                let hw1: u16 = (0xEA00 | 5) as u16;
3948                let hw2: u16 = ((5 << 8) | r3) as u16;
3949                bytes.extend_from_slice(&hw1.to_le_bytes());
3950                bytes.extend_from_slice(&hw2.to_le_bytes());
3951
3952                // Step 4
3953                bytes.extend_from_slice(&0xF240u16.to_le_bytes());
3954                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3955                bytes.extend_from_slice(&0xF2C0u16.to_le_bytes());
3956                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
3957
3958                // MUL R5, R5, R3
3959                // MUL T2: hw1 = 0xFB00|Rn, hw2 = 0xF000|(Rd<<8)|Rm
3960                let hw1: u16 = (0xFB00 | 5) as u16;
3961                let hw2: u16 = (0xF000 | (5 << 8) | r3) as u16;
3962                bytes.extend_from_slice(&hw1.to_le_bytes());
3963                bytes.extend_from_slice(&hw2.to_le_bytes());
3964
3965                // LSR.W R5, R5, #24
3966                // imm5=24=11000 → imm3=6, imm2=0, type=01(LSR)
3967                let hw1: u16 = 0xEA4F;
3968                let hw2: u16 = (0x6000 | (5 << 8) | 0x10 | 5) as u16;
3969                bytes.extend_from_slice(&hw1.to_le_bytes());
3970                bytes.extend_from_slice(&hw2.to_le_bytes());
3971
3972                // ADD rd, R4, R5 (combine lo and hi counts)
3973                // ADDS Rd, Rn, Rm (T1): 0001 100 Rm Rn Rd = 0x1800 | (Rm<<6) | (Rn<<3) | Rd
3974                let rd_bits_u16 = rd_bits as u16;
3975                let instr: u16 = 0x1800 | (5 << 6) | (4 << 3) | rd_bits_u16;
3976                bytes.extend_from_slice(&instr.to_le_bytes());
3977
3978                // POP {R3, R4, R5}
3979                bytes.extend_from_slice(&0xBC38u16.to_le_bytes());
3980
3981                // i64.popcnt returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
3982                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3983                bytes.extend_from_slice(&mov0.to_le_bytes());
3984
3985                Ok(bytes)
3986            }
3987
3988            // I64Extend8S: Sign-extend low 8 bits to 64 bits
3989            // Result: rdlo = sign_extend_8(rnlo), rdhi = rdlo >> 31
3990            ArmOp::I64Extend8S { rdlo, rdhi, rnlo } => {
3991                let rdlo_bits = reg_to_bits(rdlo);
3992                let rdhi_bits = reg_to_bits(rdhi);
3993                let rnlo_bits = reg_to_bits(rnlo);
3994                let mut bytes = Vec::new();
3995
3996                // SXTB.W rdlo, rnlo (sign-extend byte to 32-bit)
3997                // SXTB T2: hw1 = 0xFA4F, hw2 = 0xF0<Rd><Rm>
3998                let hw1: u16 = 0xFA4F_u16;
3999                let hw2: u16 = (0xF080 | (rdlo_bits << 8) | rnlo_bits) as u16;
4000                bytes.extend_from_slice(&hw1.to_le_bytes());
4001                bytes.extend_from_slice(&hw2.to_le_bytes());
4002
4003                // ASR.W rdhi, rdlo, #31 (sign-extend to high word)
4004                // ASR (immediate): hw1 = 0xEA4F, hw2 = imm3:Rd:imm2:type:Rm
4005                // For imm5=31: imm3=111, imm2=11, type=10 (ASR)
4006                // hw2 = (7 << 12) | (rdhi << 8) | (3 << 6) | (2 << 4) | rdlo
4007                let hw1: u16 = 0xEA4F;
4008                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rdlo_bits) as u16;
4009                bytes.extend_from_slice(&hw1.to_le_bytes());
4010                bytes.extend_from_slice(&hw2.to_le_bytes());
4011
4012                Ok(bytes)
4013            }
4014
4015            // I64Extend16S: Sign-extend low 16 bits to 64 bits
4016            // Result: rdlo = sign_extend_16(rnlo), rdhi = rdlo >> 31
4017            ArmOp::I64Extend16S { rdlo, rdhi, rnlo } => {
4018                let rdlo_bits = reg_to_bits(rdlo);
4019                let rdhi_bits = reg_to_bits(rdhi);
4020                let rnlo_bits = reg_to_bits(rnlo);
4021                let mut bytes = Vec::new();
4022
4023                // SXTH.W rdlo, rnlo (sign-extend halfword to 32-bit)
4024                // SXTH T2: hw1 = 0xFA0F, hw2 = 0xF0<Rd><Rm>
4025                let hw1: u16 = 0xFA0F_u16;
4026                let hw2: u16 = (0xF080 | (rdlo_bits << 8) | rnlo_bits) as u16;
4027                bytes.extend_from_slice(&hw1.to_le_bytes());
4028                bytes.extend_from_slice(&hw2.to_le_bytes());
4029
4030                // ASR.W rdhi, rdlo, #31 (sign-extend to high word)
4031                let hw1: u16 = 0xEA4F;
4032                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rdlo_bits) as u16;
4033                bytes.extend_from_slice(&hw1.to_le_bytes());
4034                bytes.extend_from_slice(&hw2.to_le_bytes());
4035
4036                Ok(bytes)
4037            }
4038
4039            // I64Extend32S: Sign-extend low 32 bits to 64 bits
4040            // Result: rdlo = rnlo, rdhi = rnlo >> 31
4041            ArmOp::I64Extend32S { rdlo, rdhi, rnlo } => {
4042                let rdlo_bits = reg_to_bits(rdlo);
4043                let rdhi_bits = reg_to_bits(rdhi);
4044                let rnlo_bits = reg_to_bits(rnlo);
4045                let mut bytes = Vec::new();
4046
4047                // MOV rdlo, rnlo (if different)
4048                if rdlo_bits != rnlo_bits {
4049                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
4050                    let d_bit = ((rdlo_bits >> 3) & 1) as u16;
4051                    let mov: u16 = 0x4600
4052                        | (d_bit << 7)
4053                        | ((rnlo_bits as u16) << 3)
4054                        | ((rdlo_bits & 0x7) as u16);
4055                    bytes.extend_from_slice(&mov.to_le_bytes());
4056                }
4057
4058                // ASR.W rdhi, rnlo, #31 (sign-extend to high word)
4059                let hw1: u16 = 0xEA4F;
4060                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rnlo_bits) as u16;
4061                bytes.extend_from_slice(&hw1.to_le_bytes());
4062                bytes.extend_from_slice(&hw2.to_le_bytes());
4063
4064                Ok(bytes)
4065            }
4066
4067            // SelectMove: IT <cond>; MOV{cond} rd, rm
4068            // Conditional move: only execute MOV if condition is true
4069            ArmOp::SelectMove { rd, rm, cond } => {
4070                let rd_bits = reg_to_bits(rd) as u16;
4071                let rm_bits = reg_to_bits(rm) as u16;
4072
4073                // Condition code encoding for IT block
4074                use synth_synthesis::Condition;
4075                let cond_bits: u16 = match cond {
4076                    Condition::EQ => 0x0, // Equal
4077                    Condition::NE => 0x1, // Not equal
4078                    Condition::HS => 0x2, // Higher or same (unsigned >=)
4079                    Condition::LO => 0x3, // Lower (unsigned <)
4080                    Condition::HI => 0x8, // Higher (unsigned >)
4081                    Condition::LS => 0x9, // Lower or same (unsigned <=)
4082                    Condition::GE => 0xA, // Greater or equal (signed)
4083                    Condition::LT => 0xB, // Less than (signed)
4084                    Condition::GT => 0xC, // Greater than (signed)
4085                    Condition::LE => 0xD, // Less or equal (signed)
4086                };
4087
4088                // IT <cond>: single Then block (mask = 0x8 for T only)
4089                // IT instruction: 1011 1111 firstcond mask
4090                let it_instr: u16 = 0xBF00 | (cond_bits << 4) | 0x8;
4091
4092                // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
4093                // This MOV will only execute if condition is true due to IT block
4094                let d_bit = (rd_bits >> 3) & 1;
4095                let mov_instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
4096
4097                // Emit: IT <cond>, MOV rd, rm
4098                let mut bytes = it_instr.to_le_bytes().to_vec();
4099                bytes.extend_from_slice(&mov_instr.to_le_bytes());
4100                Ok(bytes)
4101            }
4102
4103            // Popcnt: Population count (count set bits)
4104            // ARM Cortex-M has no native POPCNT, so we implement the bit manipulation algorithm:
4105            // x = x - ((x >> 1) & 0x55555555);
4106            // x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
4107            // x = (x + (x >> 4)) & 0x0F0F0F0F;
4108            // x = x + (x >> 8);
4109            // x = x + (x >> 16);
4110            // return x & 0x3F;
4111            //
4112            // Uses rd as working register and R12 as scratch for constants
4113            ArmOp::Popcnt { rd, rm } => {
4114                let mut bytes = Vec::new();
4115
4116                // First, move rm to rd if they're different
4117                if rd != rm {
4118                    let rd_bits = reg_to_bits(rd) as u16;
4119                    let rm_bits = reg_to_bits(rm) as u16;
4120                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
4121                    let d_bit = (rd_bits >> 3) & 1;
4122                    let mov_instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
4123                    bytes.extend_from_slice(&mov_instr.to_le_bytes());
4124                }
4125
4126                // Step 1: x = x - ((x >> 1) & 0x55555555)
4127                // Load 0x55555555 into R12
4128                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x5555)?);
4129                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x5555)?);
4130
4131                // R12_temp = rd >> 1
4132                // We need a second scratch register. Use R11.
4133                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 1)?);
4134
4135                // R11 = R11 & R12 (R11 = (x >> 1) & 0x55555555)
4136                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(11, 11, 12)?);
4137
4138                // rd = rd - R11
4139                bytes.extend_from_slice(&self.encode_thumb32_sub_reg_raw(
4140                    reg_to_bits(rd),
4141                    reg_to_bits(rd),
4142                    11,
4143                )?);
4144
4145                // Step 2: x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
4146                // Load 0x33333333 into R12
4147                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x3333)?);
4148                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x3333)?);
4149
4150                // R11 = rd & R12
4151                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4152                    11,
4153                    reg_to_bits(rd),
4154                    12,
4155                )?);
4156
4157                // rd = rd >> 2
4158                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(
4159                    reg_to_bits(rd),
4160                    reg_to_bits(rd),
4161                    2,
4162                )?);
4163
4164                // rd = rd & R12
4165                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4166                    reg_to_bits(rd),
4167                    reg_to_bits(rd),
4168                    12,
4169                )?);
4170
4171                // rd = rd + R11
4172                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4173                    reg_to_bits(rd),
4174                    reg_to_bits(rd),
4175                    11,
4176                )?);
4177
4178                // Step 3: x = (x + (x >> 4)) & 0x0F0F0F0F
4179                // R11 = rd >> 4
4180                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 4)?);
4181
4182                // rd = rd + R11
4183                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4184                    reg_to_bits(rd),
4185                    reg_to_bits(rd),
4186                    11,
4187                )?);
4188
4189                // Load 0x0F0F0F0F into R12
4190                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x0F0F)?);
4191                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x0F0F)?);
4192
4193                // rd = rd & R12
4194                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4195                    reg_to_bits(rd),
4196                    reg_to_bits(rd),
4197                    12,
4198                )?);
4199
4200                // Step 4: x = x + (x >> 8)
4201                // R11 = rd >> 8
4202                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 8)?);
4203
4204                // rd = rd + R11
4205                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4206                    reg_to_bits(rd),
4207                    reg_to_bits(rd),
4208                    11,
4209                )?);
4210
4211                // Step 5: x = x + (x >> 16)
4212                // R11 = rd >> 16
4213                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 16)?);
4214
4215                // rd = rd + R11
4216                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4217                    reg_to_bits(rd),
4218                    reg_to_bits(rd),
4219                    11,
4220                )?);
4221
4222                // Step 6: return x & 0x3F
4223                // AND with 0x3F (small immediate, can use BIC or AND with immediate)
4224                bytes.extend_from_slice(&self.encode_thumb32_and_imm_raw(
4225                    reg_to_bits(rd),
4226                    reg_to_bits(rd),
4227                    0x3F,
4228                )?);
4229
4230                Ok(bytes)
4231            }
4232
4233            // I64DivU: 64-bit unsigned division using binary long division
4234            // Input: R0:R1 = dividend, R2:R3 = divisor
4235            // Output: R0:R1 = quotient
4236            // Uses: R4-R7, R12 as loop counter (avoid R8 for Renode compatibility)
4237            ArmOp::I64DivU {
4238                rdlo: _,
4239                rdhi: _,
4240                rnlo: _,
4241                rnhi: _,
4242                rmlo: _,
4243                rmhi: _,
4244            } => {
4245                let mut bytes = Vec::new();
4246
4247                // PUSH {R4-R7} - save scratch registers (NO LR — this is inline code)
4248                // 16-bit PUSH: 1011 010 M rrrrrrrr where M=0 (no LR), r=R4-R7 = 0xF0
4249                // Encoding: 1011 0100 1111 0000 = 0xB4F0
4250                bytes.extend_from_slice(&0xB4F0u16.to_le_bytes());
4251
4252                // Initialize quotient (R4:R5) = 0
4253                bytes.extend_from_slice(&0x2400u16.to_le_bytes()); // MOV R4, #0
4254                bytes.extend_from_slice(&0x2500u16.to_le_bytes()); // MOV R5, #0
4255
4256                // Initialize remainder (R6:R7) = 0
4257                bytes.extend_from_slice(&0x2600u16.to_le_bytes()); // MOV R6, #0
4258                bytes.extend_from_slice(&0x2700u16.to_le_bytes()); // MOV R7, #0
4259
4260                // Initialize loop counter R12 = 64 (use R12 scratch instead of R8)
4261                // MOV.W R12, #64: F04F 0C40
4262                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4263                bytes.extend_from_slice(&0x0C40u16.to_le_bytes());
4264
4265                // Loop start
4266                let loop_start = bytes.len();
4267
4268                // === Loop body: process one bit ===
4269
4270                // 1. Shift quotient R4:R5 left by 1
4271                // LSLS R5, R5, #1 (16-bit: 0000 0010 1010 1101 = 0x006D -> actually 0x002D for LSL R5,R5,#1)
4272                // LSL Rd, Rm, #imm5: 000 00 imm5 Rm Rd = 000 00 00001 101 101 = 0x006D
4273                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4274                // Get carry from R4 into R5: ORR R5, R5, R4 LSR #31
4275                // Thumb-2 ORR with shifted register: EA45 75D4 = ORR.W R5, R5, R4, LSR #31
4276                // 11101010 010 S Rn | 0 imm3 Rd imm2 type Rm
4277                // type=01 (LSR), imm5=31 (imm3=111, imm2=11)
4278                bytes.extend_from_slice(&0xEA45u16.to_le_bytes());
4279                bytes.extend_from_slice(&0x75D4u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4280                // LSLS R4, R4, #1: 000 00 00001 100 100 = 0x0064
4281                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4282
4283                // 2. Shift remainder R6:R7 left by 1, OR in MSB of dividend R1
4284                // LSLS R7, R7, #1
4285                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4286                // ORR.W R7, R7, R6, LSR #31
4287                bytes.extend_from_slice(&0xEA47u16.to_le_bytes());
4288                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4289                // LSLS R6, R6, #1
4290                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4291                // ORR.W R6, R6, R1, LSR #31 (bring in MSB of dividend high)
4292                bytes.extend_from_slice(&0xEA46u16.to_le_bytes());
4293                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4294
4295                // 3. Shift dividend R0:R1 left by 1
4296                // LSLS R1, R1, #1
4297                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4298                // ORR.W R1, R1, R0, LSR #31
4299                bytes.extend_from_slice(&0xEA41u16.to_le_bytes());
4300                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4301                // LSLS R0, R0, #1
4302                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4303
4304                // 4. Compare remainder >= divisor (64-bit unsigned comparison)
4305                // Compare high words first: CMP R7, R3
4306                // CMP Rn, Rm encoding: 0x4280 | (Rm << 3) | Rn
4307                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3 (16-bit)
4308                // BHI means R7 > R3 (unsigned) - definitely subtract
4309                // BLO means R7 < R3 - definitely don't subtract
4310                // BEQ means need to check low words
4311
4312                // If high > divisor high: branch to subtract (forward +offset)
4313                // BHI.N +6 (skip CMP, skip BLO, do subtract)
4314                // BHI: 1101 1000 offset8 where cond=1000 (HI)
4315                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4 (to subtract block)
4316
4317                // If high < divisor high: branch past subtract
4318                // BLO.N +10 (skip to decrement)
4319                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BLO/BCC +12 (past subtract)
4320
4321                // High words equal, compare low: CMP R6, R2
4322                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2 (16-bit)
4323                // BLO/BCC past subtract (skip SUBS+SBC.W+ORR.W = 10 bytes = 4 halfwords from PC+4)
4324                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords (past subtract)
4325
4326                // === Subtract block: remainder -= divisor, quotient |= 1 ===
4327                // SUBS R6, R6, R2
4328                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2 (16-bit)
4329                // SBC R7, R7, R3 (with borrow)
4330                // Thumb-2 SBC.W: EB67 0703 = SBC.W R7, R7, R3
4331                bytes.extend_from_slice(&0xEB67u16.to_le_bytes());
4332                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4333                // ORR R4, R4, #1 (set bit 0 of quotient low)
4334                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4335                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4336
4337                // === Decrement counter and loop ===
4338                // SUBS.W R12, R12, #1 (decrement loop counter)
4339                // SUBS.W R12, R12, #1: F1BC 0C01
4340                bytes.extend_from_slice(&0xF1BCu16.to_le_bytes());
4341                bytes.extend_from_slice(&0x0C01u16.to_le_bytes());
4342
4343                // BNE back to loop_start
4344                let branch_offset_bytes = bytes.len() - loop_start + 4; // +4 for pipeline
4345                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4346                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4347                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4348
4349                // === Loop done, move quotient to R0:R1 ===
4350                bytes.extend_from_slice(&0x4620u16.to_le_bytes()); // MOV R0, R4
4351                bytes.extend_from_slice(&0x4629u16.to_le_bytes()); // MOV R1, R5
4352
4353                // POP {R4-R7} - restore scratch registers (NO PC — inline code continues)
4354                // 16-bit POP: 1011 110 P rrrrrrrr where P=0 (no PC), r=R4-R7 = 0xF0
4355                // Encoding: 1011 1100 1111 0000 = 0xBCF0
4356                bytes.extend_from_slice(&0xBCF0u16.to_le_bytes());
4357
4358                Ok(bytes)
4359            }
4360
4361            // I64DivS: 64-bit signed division
4362            // Converts to unsigned, divides, then applies sign
4363            // Input: R0:R1 = dividend (signed), R2:R3 = divisor (signed)
4364            // Output: R0:R1 = quotient (signed)
4365            ArmOp::I64DivS {
4366                rdlo: _,
4367                rdhi: _,
4368                rnlo: _,
4369                rnhi: _,
4370                rmlo: _,
4371                rmhi: _,
4372            } => {
4373                let mut bytes = Vec::new();
4374
4375                // PUSH {R4-R11} - save scratch registers (NO LR — inline code)
4376                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4377                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4378
4379                // Save result sign in R9: R9 = R1 XOR R3 (sign bit = MSB)
4380                // EOR.W R9, R1, R3
4381                bytes.extend_from_slice(&0xEA81u16.to_le_bytes());
4382                bytes.extend_from_slice(&0x0903u16.to_le_bytes());
4383
4384                // If dividend negative (R1 MSB set), negate it
4385                // TST R1, R1 (check sign)
4386                bytes.extend_from_slice(&0x4209u16.to_le_bytes()); // TST R1, R1
4387                // BPL skip_neg_dividend (+10 bytes = 5 halfwords)
4388                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4389
4390                // Negate R0:R1 (64-bit): RSBS R0, R0, #0; SBC R1, R1, R1 LSL #1
4391                // Actually: MVN R0, R0; MVN R1, R1; ADDS R0, R0, #1; ADC R1, R1, #0
4392                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4393                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4394                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4395                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4396                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4397
4398                // If divisor negative (R3 MSB set), negate it
4399                bytes.extend_from_slice(&0x421Bu16.to_le_bytes()); // TST R3, R3
4400                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4401
4402                // Negate R2:R3
4403                bytes.extend_from_slice(&0x43D2u16.to_le_bytes()); // MVNS R2, R2
4404                bytes.extend_from_slice(&0x43DBu16.to_le_bytes()); // MVNS R3, R3
4405                bytes.extend_from_slice(&0x1C52u16.to_le_bytes()); // ADDS R2, R2, #1
4406                bytes.extend_from_slice(&0xF143u16.to_le_bytes()); // ADC.W R3, R3, #0
4407                bytes.extend_from_slice(&0x0300u16.to_le_bytes());
4408
4409                // === Now do unsigned division (same as I64DivU) ===
4410                // Initialize quotient (R4:R5) = 0
4411                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4412                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4413                // Initialize remainder (R6:R7) = 0
4414                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4415                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4416                // Initialize loop counter R8 = 64
4417                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4418                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4419
4420                let loop_start = bytes.len();
4421
4422                // Shift quotient left
4423                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4424                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4425                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4426                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4427
4428                // Shift remainder left, OR in MSB of dividend
4429                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4430                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4431                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4432                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4433                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4434                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4435
4436                // Shift dividend left
4437                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4438                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4439                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4440                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4441
4442                // Compare and conditionally subtract
4443                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4444                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4445                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4446                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4447                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4448
4449                // Subtract and set quotient bit
4450                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4451                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4452                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4453                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4454                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4455
4456                // Decrement and loop
4457                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4458                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4459
4460                let branch_offset_bytes = bytes.len() - loop_start + 4;
4461                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4462                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4463                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4464
4465                // Move quotient to R0:R1
4466                bytes.extend_from_slice(&0x4620u16.to_le_bytes()); // MOV R0, R4
4467                bytes.extend_from_slice(&0x4629u16.to_le_bytes()); // MOV R1, R5
4468
4469                // If result should be negative (R9 MSB set), negate R0:R1
4470                bytes.extend_from_slice(&0xF1B9u16.to_le_bytes()); // TST.W R9, R9 (check MSB)
4471                bytes.extend_from_slice(&0x0F00u16.to_le_bytes());
4472                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8 (skip negation)
4473
4474                // Negate result R0:R1
4475                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4476                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4477                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4478                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4479                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4480
4481                // POP {R4-R11} - restore scratch registers (NO PC — inline code continues)
4482                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4483                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4484
4485                Ok(bytes)
4486            }
4487
4488            // I64RemU: 64-bit unsigned remainder using binary long division
4489            // Same algorithm as I64DivU but returns remainder instead of quotient
4490            // Input: R0:R1 = dividend, R2:R3 = divisor
4491            // Output: R0:R1 = remainder
4492            ArmOp::I64RemU {
4493                rdlo: _,
4494                rdhi: _,
4495                rnlo: _,
4496                rnhi: _,
4497                rmlo: _,
4498                rmhi: _,
4499            } => {
4500                let mut bytes = Vec::new();
4501
4502                // PUSH {R4-R8} - save scratch registers (NO LR — inline code)
4503                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4504                bytes.extend_from_slice(&0x01F0u16.to_le_bytes());
4505
4506                // Initialize quotient (R4:R5) = 0 (computed but not returned)
4507                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4508                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4509                // Initialize remainder (R6:R7) = 0
4510                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4511                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4512                // Initialize loop counter R8 = 64
4513                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4514                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4515
4516                let loop_start = bytes.len();
4517
4518                // Shift quotient left (not needed for result, but keeps algorithm same)
4519                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4520                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4521                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4522                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4523
4524                // Shift remainder left, OR in MSB of dividend
4525                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4526                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4527                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4528                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4529                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4530                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4531
4532                // Shift dividend left
4533                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4534                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4535                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4536                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4537
4538                // Compare and conditionally subtract
4539                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4540                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4541                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4542                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4543                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4544
4545                // Subtract and set quotient bit
4546                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4547                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4548                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4549                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4550                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4551
4552                // Decrement and loop
4553                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4554                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4555
4556                let branch_offset_bytes = bytes.len() - loop_start + 4;
4557                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4558                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4559                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4560
4561                // Move REMAINDER to R0:R1 (difference from I64DivU)
4562                bytes.extend_from_slice(&0x4630u16.to_le_bytes()); // MOV R0, R6
4563                bytes.extend_from_slice(&0x4639u16.to_le_bytes()); // MOV R1, R7
4564
4565                // POP {R4-R8} - restore scratch registers (NO PC — inline code continues)
4566                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4567                bytes.extend_from_slice(&0x01F0u16.to_le_bytes());
4568
4569                Ok(bytes)
4570            }
4571
4572            // I64RemS: 64-bit signed remainder
4573            // Remainder sign follows dividend sign (not quotient rule)
4574            // Input: R0:R1 = dividend (signed), R2:R3 = divisor (signed)
4575            // Output: R0:R1 = remainder (signed, same sign as dividend)
4576            ArmOp::I64RemS {
4577                rdlo: _,
4578                rdhi: _,
4579                rnlo: _,
4580                rnhi: _,
4581                rmlo: _,
4582                rmhi: _,
4583            } => {
4584                let mut bytes = Vec::new();
4585
4586                // PUSH {R4-R11} - save scratch registers (NO LR — inline code)
4587                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4588                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4589
4590                // Save dividend sign in R9 (remainder sign = dividend sign)
4591                // MOV R9, R1 (just need the sign bit)
4592                bytes.extend_from_slice(&0x4689u16.to_le_bytes()); // MOV R9, R1
4593
4594                // If dividend negative (R1 MSB set), negate it
4595                bytes.extend_from_slice(&0x4209u16.to_le_bytes()); // TST R1, R1
4596                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4597
4598                // Negate R0:R1
4599                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4600                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4601                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4602                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4603                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4604
4605                // If divisor negative (R3 MSB set), negate it
4606                bytes.extend_from_slice(&0x421Bu16.to_le_bytes()); // TST R3, R3
4607                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4608
4609                // Negate R2:R3
4610                bytes.extend_from_slice(&0x43D2u16.to_le_bytes()); // MVNS R2, R2
4611                bytes.extend_from_slice(&0x43DBu16.to_le_bytes()); // MVNS R3, R3
4612                bytes.extend_from_slice(&0x1C52u16.to_le_bytes()); // ADDS R2, R2, #1
4613                bytes.extend_from_slice(&0xF143u16.to_le_bytes()); // ADC.W R3, R3, #0
4614                bytes.extend_from_slice(&0x0300u16.to_le_bytes());
4615
4616                // === Unsigned division algorithm ===
4617                // Initialize quotient (R4:R5) = 0
4618                bytes.extend_from_slice(&0x2400u16.to_le_bytes());
4619                bytes.extend_from_slice(&0x2500u16.to_le_bytes());
4620                // Initialize remainder (R6:R7) = 0
4621                bytes.extend_from_slice(&0x2600u16.to_le_bytes());
4622                bytes.extend_from_slice(&0x2700u16.to_le_bytes());
4623                // Initialize loop counter R8 = 64
4624                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4625                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4626
4627                let loop_start = bytes.len();
4628
4629                // Shift quotient left
4630                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4631                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4632                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4633                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4634
4635                // Shift remainder left, OR in MSB of dividend
4636                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4637                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4638                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4639                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4640                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4641                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4642
4643                // Shift dividend left
4644                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4645                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4646                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4647                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4648
4649                // Compare and conditionally subtract
4650                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4651                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4652                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4653                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4654                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4655
4656                // Subtract and set quotient bit
4657                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4658                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4659                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4660                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4661                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4662
4663                // Decrement and loop
4664                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUB.W R8, R8, #1
4665                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4666
4667                let branch_offset_bytes = bytes.len() - loop_start + 4;
4668                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4669                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4670                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4671
4672                // Move remainder to R0:R1
4673                bytes.extend_from_slice(&0x4630u16.to_le_bytes()); // MOV R0, R6
4674                bytes.extend_from_slice(&0x4639u16.to_le_bytes()); // MOV R1, R7
4675
4676                // If original dividend was negative (R9 MSB set), negate remainder
4677                bytes.extend_from_slice(&0xF1B9u16.to_le_bytes()); // TST.W R9, R9
4678                bytes.extend_from_slice(&0x0F00u16.to_le_bytes());
4679                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4680
4681                // Negate result R0:R1
4682                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4683                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4684                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4685                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4686                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4687
4688                // POP {R4-R11} - restore scratch registers (NO PC — inline code continues)
4689                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4690                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4691
4692                Ok(bytes)
4693            }
4694
4695            // === F32 VFP single-precision Thumb-2 encodings ===
4696            // VFP instruction words are identical to ARM32; emit as two LE halfwords.
4697            ArmOp::F32Add { sd, sn, sm } => {
4698                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE300A00, sd, sn, sm)?))
4699            }
4700            ArmOp::F32Sub { sd, sn, sm } => {
4701                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE300A40, sd, sn, sm)?))
4702            }
4703            ArmOp::F32Mul { sd, sn, sm } => {
4704                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE200A00, sd, sn, sm)?))
4705            }
4706            ArmOp::F32Div { sd, sn, sm } => {
4707                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE800A00, sd, sn, sm)?))
4708            }
4709            ArmOp::F32Abs { sd, sm } => {
4710                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB00AC0, sd, sm)?))
4711            }
4712            ArmOp::F32Neg { sd, sm } => {
4713                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB10A40, sd, sm)?))
4714            }
4715            ArmOp::F32Sqrt { sd, sm } => {
4716                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB10AC0, sd, sm)?))
4717            }
4718
4719            // f32 pseudo-ops — multi-instruction sequences
4720            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
4721            ArmOp::F32Ceil { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b01),
4722            ArmOp::F32Floor { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b10),
4723            ArmOp::F32Trunc { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b11),
4724            ArmOp::F32Nearest { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b00),
4725            ArmOp::F32Min { sd, sn, sm } => self.encode_thumb_f32_minmax(sd, sn, sm, true),
4726            ArmOp::F32Max { sd, sn, sm } => self.encode_thumb_f32_minmax(sd, sn, sm, false),
4727            ArmOp::F32Copysign { sd, sn, sm } => self.encode_thumb_f32_copysign(sd, sn, sm),
4728
4729            // f32 comparisons — VCMP + VMRS + MOV #0 + IT + MOV #1
4730            ArmOp::F32Eq { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x0),
4731            ArmOp::F32Ne { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x1),
4732            ArmOp::F32Lt { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x4),
4733            ArmOp::F32Le { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x9),
4734            ArmOp::F32Gt { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0xC),
4735            ArmOp::F32Ge { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0xA),
4736
4737            ArmOp::F32Const { sd, value } => self.encode_thumb_f32_const(sd, *value),
4738
4739            ArmOp::F32Load { sd, addr } => {
4740                Ok(vfp_to_thumb_bytes(encode_vfp_ldst(0xED900A00, sd, addr)?))
4741            }
4742            ArmOp::F32Store { sd, addr } => {
4743                Ok(vfp_to_thumb_bytes(encode_vfp_ldst(0xED800A00, sd, addr)?))
4744            }
4745
4746            ArmOp::F32ConvertI32S { sd, rm } => self.encode_thumb_f32_convert_i32(sd, rm, true),
4747            ArmOp::F32ConvertI32U { sd, rm } => self.encode_thumb_f32_convert_i32(sd, rm, false),
4748            ArmOp::F32ConvertI64S { .. } | ArmOp::F32ConvertI64U { .. } => {
4749                Err(synth_core::Error::synthesis(
4750                    "F32 i64 conversion not supported (requires register pairs on 32-bit ARM)",
4751                ))
4752            }
4753            ArmOp::F32ReinterpretI32 { sd, rm } => {
4754                Ok(vfp_to_thumb_bytes(encode_vmov_core_sreg(true, sd, rm)?))
4755            }
4756            ArmOp::I32ReinterpretF32 { rd, sm } => {
4757                Ok(vfp_to_thumb_bytes(encode_vmov_core_sreg(false, sm, rd)?))
4758            }
4759            ArmOp::I32TruncF32S { rd, sm } => self.encode_thumb_i32_trunc_f32(rd, sm, true),
4760            ArmOp::I32TruncF32U { rd, sm } => self.encode_thumb_i32_trunc_f32(rd, sm, false),
4761
4762            // === F64 VFP double-precision Thumb-2 encodings ===
4763            // VFP instruction words are identical to ARM32; emit as two LE halfwords.
4764            ArmOp::F64Add { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4765                0xEE300B00, dd, dn, dm,
4766            )?)),
4767            ArmOp::F64Sub { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4768                0xEE300B40, dd, dn, dm,
4769            )?)),
4770            ArmOp::F64Mul { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4771                0xEE200B00, dd, dn, dm,
4772            )?)),
4773            ArmOp::F64Div { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4774                0xEE800B00, dd, dn, dm,
4775            )?)),
4776            ArmOp::F64Abs { dd, dm } => {
4777                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB00BC0, dd, dm)?))
4778            }
4779            ArmOp::F64Neg { dd, dm } => {
4780                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB10B40, dd, dm)?))
4781            }
4782            ArmOp::F64Sqrt { dd, dm } => {
4783                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB10BC0, dd, dm)?))
4784            }
4785
4786            // f64 pseudo-ops
4787            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
4788            ArmOp::F64Ceil { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b01),
4789            ArmOp::F64Floor { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b10),
4790            ArmOp::F64Trunc { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b11),
4791            ArmOp::F64Nearest { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b00),
4792            ArmOp::F64Min { dd, dn, dm } => self.encode_thumb_f64_minmax(dd, dn, dm, true),
4793            ArmOp::F64Max { dd, dn, dm } => self.encode_thumb_f64_minmax(dd, dn, dm, false),
4794            ArmOp::F64Copysign { dd, dn, dm } => self.encode_thumb_f64_copysign(dd, dn, dm),
4795
4796            // f64 comparisons
4797            ArmOp::F64Eq { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x0),
4798            ArmOp::F64Ne { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x1),
4799            ArmOp::F64Lt { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x4),
4800            ArmOp::F64Le { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x9),
4801            ArmOp::F64Gt { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0xC),
4802            ArmOp::F64Ge { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0xA),
4803
4804            ArmOp::F64Const { dd, value } => self.encode_thumb_f64_const(dd, *value),
4805
4806            ArmOp::F64Load { dd, addr } => Ok(vfp_to_thumb_bytes(encode_vfp_ldst_f64(
4807                0xED900B00, dd, addr,
4808            )?)),
4809            ArmOp::F64Store { dd, addr } => Ok(vfp_to_thumb_bytes(encode_vfp_ldst_f64(
4810                0xED800B00, dd, addr,
4811            )?)),
4812
4813            ArmOp::F64ConvertI32S { dd, rm } => self.encode_thumb_f64_convert_i32(dd, rm, true),
4814            ArmOp::F64ConvertI32U { dd, rm } => self.encode_thumb_f64_convert_i32(dd, rm, false),
4815            ArmOp::F64ConvertI64S { .. } | ArmOp::F64ConvertI64U { .. } => {
4816                Err(synth_core::Error::synthesis(
4817                    "F64 i64 conversion not supported (requires register pairs on 32-bit ARM)",
4818                ))
4819            }
4820            ArmOp::F64PromoteF32 { dd, sm } => self.encode_thumb_f64_promote_f32(dd, sm),
4821            ArmOp::F64ReinterpretI64 { dd, rmlo, rmhi } => Ok(vfp_to_thumb_bytes(
4822                encode_vmov_core_dreg(true, dd, rmlo, rmhi)?,
4823            )),
4824            ArmOp::I64ReinterpretF64 { rdlo, rdhi, dm } => Ok(vfp_to_thumb_bytes(
4825                encode_vmov_core_dreg(false, dm, rdlo, rdhi)?,
4826            )),
4827            ArmOp::I64TruncF64S { .. } | ArmOp::I64TruncF64U { .. } => {
4828                Err(synth_core::Error::synthesis(
4829                    "i64 truncation from F64 not supported (requires i64 register pairs on 32-bit ARM)",
4830                ))
4831            }
4832            ArmOp::I32TruncF64S { rd, dm } => self.encode_thumb_i32_trunc_f64(rd, dm, true),
4833            ArmOp::I32TruncF64U { rd, dm } => self.encode_thumb_i32_trunc_f64(rd, dm, false),
4834
4835            // ===== i64 operations: encode as multi-instruction Thumb-2 sequences =====
4836
4837            // I64Add: ADDS rdlo, rnlo, rmlo; ADC.W rdhi, rnhi, rmhi
4838            ArmOp::I64Add {
4839                rdlo,
4840                rdhi,
4841                rnlo,
4842                rnhi,
4843                rmlo,
4844                rmhi,
4845            } => {
4846                let mut bytes = Vec::new();
4847                // ADDS rdlo, rnlo, rmlo (16-bit)
4848                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Adds {
4849                    rd: *rdlo,
4850                    rn: *rnlo,
4851                    op2: Operand2::Reg(*rmlo),
4852                })?);
4853                // ADC.W rdhi, rnhi, rmhi (32-bit)
4854                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Adc {
4855                    rd: *rdhi,
4856                    rn: *rnhi,
4857                    op2: Operand2::Reg(*rmhi),
4858                })?);
4859                Ok(bytes)
4860            }
4861
4862            // I64Sub: SUBS rdlo, rnlo, rmlo; SBC.W rdhi, rnhi, rmhi
4863            ArmOp::I64Sub {
4864                rdlo,
4865                rdhi,
4866                rnlo,
4867                rnhi,
4868                rmlo,
4869                rmhi,
4870            } => {
4871                let mut bytes = Vec::new();
4872                // SUBS rdlo, rnlo, rmlo (16-bit)
4873                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Subs {
4874                    rd: *rdlo,
4875                    rn: *rnlo,
4876                    op2: Operand2::Reg(*rmlo),
4877                })?);
4878                // SBC.W rdhi, rnhi, rmhi (32-bit)
4879                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Sbc {
4880                    rd: *rdhi,
4881                    rn: *rnhi,
4882                    op2: Operand2::Reg(*rmhi),
4883                })?);
4884                Ok(bytes)
4885            }
4886
4887            // I64And: AND rdlo, rnlo, rmlo; AND rdhi, rnhi, rmhi
4888            ArmOp::I64And {
4889                rdlo,
4890                rdhi,
4891                rnlo,
4892                rnhi,
4893                rmlo,
4894                rmhi,
4895            } => {
4896                let mut bytes = Vec::new();
4897                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::And {
4898                    rd: *rdlo,
4899                    rn: *rnlo,
4900                    op2: Operand2::Reg(*rmlo),
4901                })?);
4902                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::And {
4903                    rd: *rdhi,
4904                    rn: *rnhi,
4905                    op2: Operand2::Reg(*rmhi),
4906                })?);
4907                Ok(bytes)
4908            }
4909
4910            // I64Or: ORR rdlo, rnlo, rmlo; ORR rdhi, rnhi, rmhi
4911            ArmOp::I64Or {
4912                rdlo,
4913                rdhi,
4914                rnlo,
4915                rnhi,
4916                rmlo,
4917                rmhi,
4918            } => {
4919                let mut bytes = Vec::new();
4920                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Orr {
4921                    rd: *rdlo,
4922                    rn: *rnlo,
4923                    op2: Operand2::Reg(*rmlo),
4924                })?);
4925                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Orr {
4926                    rd: *rdhi,
4927                    rn: *rnhi,
4928                    op2: Operand2::Reg(*rmhi),
4929                })?);
4930                Ok(bytes)
4931            }
4932
4933            // I64Xor: EOR rdlo, rnlo, rmlo; EOR rdhi, rnhi, rmhi
4934            ArmOp::I64Xor {
4935                rdlo,
4936                rdhi,
4937                rnlo,
4938                rnhi,
4939                rmlo,
4940                rmhi,
4941            } => {
4942                let mut bytes = Vec::new();
4943                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Eor {
4944                    rd: *rdlo,
4945                    rn: *rnlo,
4946                    op2: Operand2::Reg(*rmlo),
4947                })?);
4948                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Eor {
4949                    rd: *rdhi,
4950                    rn: *rnhi,
4951                    op2: Operand2::Reg(*rmhi),
4952                })?);
4953                Ok(bytes)
4954            }
4955
4956            // I64Eqz: ORR scratch, lo, hi; ITE EQ; MOV rd, #1; MOV rd, #0
4957            ArmOp::I64Eqz { rd, rnlo, rnhi } => self.encode_thumb(&ArmOp::I64SetCondZ {
4958                rd: *rd,
4959                rn_lo: *rnlo,
4960                rn_hi: *rnhi,
4961            }),
4962
4963            // I64 comparisons: delegate to I64SetCond
4964            ArmOp::I64Eq {
4965                rd,
4966                rnlo,
4967                rnhi,
4968                rmlo,
4969                rmhi,
4970            } => self.encode_thumb(&ArmOp::I64SetCond {
4971                rd: *rd,
4972                rn_lo: *rnlo,
4973                rn_hi: *rnhi,
4974                rm_lo: *rmlo,
4975                rm_hi: *rmhi,
4976                cond: synth_synthesis::Condition::EQ,
4977            }),
4978
4979            ArmOp::I64Ne {
4980                rd,
4981                rnlo,
4982                rnhi,
4983                rmlo,
4984                rmhi,
4985            } => self.encode_thumb(&ArmOp::I64SetCond {
4986                rd: *rd,
4987                rn_lo: *rnlo,
4988                rn_hi: *rnhi,
4989                rm_lo: *rmlo,
4990                rm_hi: *rmhi,
4991                cond: synth_synthesis::Condition::NE,
4992            }),
4993
4994            ArmOp::I64LtS {
4995                rd,
4996                rnlo,
4997                rnhi,
4998                rmlo,
4999                rmhi,
5000            } => self.encode_thumb(&ArmOp::I64SetCond {
5001                rd: *rd,
5002                rn_lo: *rnlo,
5003                rn_hi: *rnhi,
5004                rm_lo: *rmlo,
5005                rm_hi: *rmhi,
5006                cond: synth_synthesis::Condition::LT,
5007            }),
5008
5009            ArmOp::I64LtU {
5010                rd,
5011                rnlo,
5012                rnhi,
5013                rmlo,
5014                rmhi,
5015            } => self.encode_thumb(&ArmOp::I64SetCond {
5016                rd: *rd,
5017                rn_lo: *rnlo,
5018                rn_hi: *rnhi,
5019                rm_lo: *rmlo,
5020                rm_hi: *rmhi,
5021                cond: synth_synthesis::Condition::LO,
5022            }),
5023
5024            ArmOp::I64LeS {
5025                rd,
5026                rnlo,
5027                rnhi,
5028                rmlo,
5029                rmhi,
5030            } => self.encode_thumb(&ArmOp::I64SetCond {
5031                rd: *rd,
5032                rn_lo: *rnlo,
5033                rn_hi: *rnhi,
5034                rm_lo: *rmlo,
5035                rm_hi: *rmhi,
5036                cond: synth_synthesis::Condition::LE,
5037            }),
5038
5039            ArmOp::I64LeU {
5040                rd,
5041                rnlo,
5042                rnhi,
5043                rmlo,
5044                rmhi,
5045            } => self.encode_thumb(&ArmOp::I64SetCond {
5046                rd: *rd,
5047                rn_lo: *rnlo,
5048                rn_hi: *rnhi,
5049                rm_lo: *rmlo,
5050                rm_hi: *rmhi,
5051                cond: synth_synthesis::Condition::LS,
5052            }),
5053
5054            ArmOp::I64GtS {
5055                rd,
5056                rnlo,
5057                rnhi,
5058                rmlo,
5059                rmhi,
5060            } => self.encode_thumb(&ArmOp::I64SetCond {
5061                rd: *rd,
5062                rn_lo: *rnlo,
5063                rn_hi: *rnhi,
5064                rm_lo: *rmlo,
5065                rm_hi: *rmhi,
5066                cond: synth_synthesis::Condition::GT,
5067            }),
5068
5069            ArmOp::I64GtU {
5070                rd,
5071                rnlo,
5072                rnhi,
5073                rmlo,
5074                rmhi,
5075            } => self.encode_thumb(&ArmOp::I64SetCond {
5076                rd: *rd,
5077                rn_lo: *rnlo,
5078                rn_hi: *rnhi,
5079                rm_lo: *rmlo,
5080                rm_hi: *rmhi,
5081                cond: synth_synthesis::Condition::HI,
5082            }),
5083
5084            ArmOp::I64GeS {
5085                rd,
5086                rnlo,
5087                rnhi,
5088                rmlo,
5089                rmhi,
5090            } => self.encode_thumb(&ArmOp::I64SetCond {
5091                rd: *rd,
5092                rn_lo: *rnlo,
5093                rn_hi: *rnhi,
5094                rm_lo: *rmlo,
5095                rm_hi: *rmhi,
5096                cond: synth_synthesis::Condition::GE,
5097            }),
5098
5099            ArmOp::I64GeU {
5100                rd,
5101                rnlo,
5102                rnhi,
5103                rmlo,
5104                rmhi,
5105            } => self.encode_thumb(&ArmOp::I64SetCond {
5106                rd: *rd,
5107                rn_lo: *rnlo,
5108                rn_hi: *rnhi,
5109                rm_lo: *rmlo,
5110                rm_hi: *rmhi,
5111                cond: synth_synthesis::Condition::HS,
5112            }),
5113
5114            // I64Const: MOVW rdlo, lo16; MOVT rdlo, hi16; MOVW rdhi, lo16_hi; MOVT rdhi, hi16_hi
5115            ArmOp::I64Const { rdlo, rdhi, value } => {
5116                let lo32 = *value as u32;
5117                let hi32 = (*value >> 32) as u32;
5118                let mut bytes = Vec::new();
5119                // Load low 32 bits into rdlo
5120                bytes.extend_from_slice(
5121                    &self.encode_thumb32_movw_raw(reg_to_bits(rdlo), lo32 & 0xFFFF)?,
5122                );
5123                if lo32 > 0xFFFF {
5124                    bytes.extend_from_slice(
5125                        &self.encode_thumb32_movt_raw(reg_to_bits(rdlo), lo32 >> 16)?,
5126                    );
5127                }
5128                // Load high 32 bits into rdhi
5129                bytes.extend_from_slice(
5130                    &self.encode_thumb32_movw_raw(reg_to_bits(rdhi), hi32 & 0xFFFF)?,
5131                );
5132                if hi32 > 0xFFFF {
5133                    bytes.extend_from_slice(
5134                        &self.encode_thumb32_movt_raw(reg_to_bits(rdhi), hi32 >> 16)?,
5135                    );
5136                }
5137                Ok(bytes)
5138            }
5139
5140            // I64Ldr: LDR rdlo, [base, offset]; LDR rdhi, [base, offset+4]
5141            ArmOp::I64Ldr { rdlo, rdhi, addr } => {
5142                let mut bytes = Vec::new();
5143                let offset = if addr.offset < 0 {
5144                    0u32
5145                } else {
5146                    addr.offset as u32
5147                };
5148                bytes.extend_from_slice(&self.encode_thumb32_ldr(rdlo, &addr.base, offset)?);
5149                bytes.extend_from_slice(&self.encode_thumb32_ldr(
5150                    rdhi,
5151                    &addr.base,
5152                    offset.wrapping_add(4),
5153                )?);
5154                Ok(bytes)
5155            }
5156
5157            // I64Str: STR rdlo, [base, offset]; STR rdhi, [base, offset+4]
5158            ArmOp::I64Str { rdlo, rdhi, addr } => {
5159                let mut bytes = Vec::new();
5160                let offset = if addr.offset < 0 {
5161                    0u32
5162                } else {
5163                    addr.offset as u32
5164                };
5165                bytes.extend_from_slice(&self.encode_thumb32_str(rdlo, &addr.base, offset)?);
5166                bytes.extend_from_slice(&self.encode_thumb32_str(
5167                    rdhi,
5168                    &addr.base,
5169                    offset.wrapping_add(4),
5170                )?);
5171                Ok(bytes)
5172            }
5173
5174            // I64ExtendI32S: MOV rdlo, rn; ASR rdhi, rdlo, #31 (sign-extend)
5175            ArmOp::I64ExtendI32S { rdlo, rdhi, rn } => {
5176                let mut bytes = Vec::new();
5177                if rdlo != rn {
5178                    // MOV rdlo, rn (16-bit)
5179                    bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Mov {
5180                        rd: *rdlo,
5181                        op2: Operand2::Reg(*rn),
5182                    })?);
5183                }
5184                // ASR rdhi, rdlo, #31 (sign-extend: fill high word with sign bit)
5185                bytes.extend_from_slice(
5186                    &self.encode_thumb32_shift(rdhi, rdlo, 31, 0b10)?, // ASR type
5187                );
5188                Ok(bytes)
5189            }
5190
5191            // I64ExtendI32U: MOV rdlo, rn; MOV rdhi, #0
5192            ArmOp::I64ExtendI32U { rdlo, rdhi, rn } => {
5193                let mut bytes = Vec::new();
5194                if rdlo != rn {
5195                    // MOV rdlo, rn
5196                    bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Mov {
5197                        rd: *rdlo,
5198                        op2: Operand2::Reg(*rn),
5199                    })?);
5200                }
5201                // MOV rdhi, #0 (16-bit: MOVS Rd, #0)
5202                let rdhi_bits = reg_to_bits(rdhi) as u16;
5203                let instr: u16 = 0x2000 | (rdhi_bits << 8);
5204                bytes.extend_from_slice(&instr.to_le_bytes());
5205                Ok(bytes)
5206            }
5207
5208            // I32WrapI64: MOV rd, rnlo (just take low 32 bits)
5209            ArmOp::I32WrapI64 { rd, rnlo } => {
5210                if rd == rnlo {
5211                    // No-op: already in the right register
5212                    let instr: u16 = 0xBF00; // NOP
5213                    Ok(instr.to_le_bytes().to_vec())
5214                } else {
5215                    // MOV rd, rnlo
5216                    self.encode_thumb(&ArmOp::Mov {
5217                        rd: *rd,
5218                        op2: Operand2::Reg(*rnlo),
5219                    })
5220                }
5221            }
5222
5223            // ===== Helium MVE operations (Thumb-2 encoding) =====
5224            ArmOp::MveLoad { qd, addr } => Ok(vfp_to_thumb_bytes(encode_mve_vldrw(qd, addr))),
5225            ArmOp::MveStore { qd, addr } => Ok(vfp_to_thumb_bytes(encode_mve_vstrw(qd, addr))),
5226            ArmOp::MveConst { qd, bytes } => self.encode_thumb_mve_const(qd, bytes),
5227            ArmOp::MveAnd { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5228                0xEF000150, qd, qn, qm,
5229            ))),
5230            ArmOp::MveOrr { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5231                0xEF200150, qd, qn, qm,
5232            ))),
5233            ArmOp::MveEor { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5234                0xFF000150, qd, qn, qm,
5235            ))),
5236            ArmOp::MveMvn { qd, qm } => {
5237                // VMVN Qd, Qm: 0xFFB005C0 | Qd<<12 | Qm
5238                let qd_enc = qreg_to_num(qd);
5239                let qm_enc = qreg_to_num(qm);
5240                let instr: u32 = 0xFFB005C0 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5241                Ok(vfp_to_thumb_bytes(instr))
5242            }
5243            ArmOp::MveBic { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5244                0xEF100150, qd, qn, qm,
5245            ))),
5246            ArmOp::MveAddI { qd, qn, qm, size } => {
5247                let sz = mve_size_bits(size);
5248                let base: u32 = 0xEF000840 | (sz << 20);
5249                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5250            }
5251            ArmOp::MveSubI { qd, qn, qm, size } => {
5252                let sz = mve_size_bits(size);
5253                let base: u32 = 0xFF000840 | (sz << 20);
5254                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5255            }
5256            ArmOp::MveMulI { qd, qn, qm, size } => {
5257                let sz = mve_size_bits(size);
5258                let base: u32 = 0xEF000950 | (sz << 20);
5259                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5260            }
5261            ArmOp::MveNegI { qd, qm, size } => {
5262                let sz = mve_size_bits(size);
5263                // VNEG.Sx Qd, Qm
5264                let qd_enc = qreg_to_num(qd);
5265                let qm_enc = qreg_to_num(qm);
5266                let base: u32 = 0xFFB103C0 | (sz << 18);
5267                let instr = base | ((qd_enc * 2) << 12) | (qm_enc * 2);
5268                Ok(vfp_to_thumb_bytes(instr))
5269            }
5270            ArmOp::MveDup { qd, rn, size } => {
5271                let sz = mve_size_bits(size);
5272                let qd_enc = qreg_to_num(qd);
5273                let rn_bits = reg_to_bits(rn);
5274                // VDUP.sz Qd, Rn: EEA0 0B10 variant
5275                // size encoding: 00=32, 01=16, 10=8
5276                let be = match sz {
5277                    0 => 0b00u32, // 8-bit
5278                    1 => 0b01,    // 16-bit
5279                    _ => 0b00,    // 32-bit (default)
5280                };
5281                let instr: u32 = 0xEEA00B10 | ((qd_enc * 2) << 16) | (rn_bits << 12) | (be << 5);
5282                Ok(vfp_to_thumb_bytes(instr))
5283            }
5284            ArmOp::MveExtractLane { rd, qn, lane, size } => {
5285                let qn_enc = qreg_to_num(qn);
5286                let rd_bits = reg_to_bits(rd);
5287                // VMOV.sz Rd, Dn[x] — extract from Q-register lane
5288                // For 32-bit: VMOV Rd, Dn — where Dn is the appropriate D-register
5289                let d_reg = qn_enc * 2 + ((*lane as u32) >> 1);
5290                let lane_in_d = (*lane as u32) & 1;
5291                let _sz = mve_size_bits(size);
5292                // VMOV Rd, Dn[x]: EE10 0B10 for 32-bit
5293                let instr: u32 = 0xEE100B10 | (d_reg << 16) | (rd_bits << 12) | (lane_in_d << 21);
5294                Ok(vfp_to_thumb_bytes(instr))
5295            }
5296            ArmOp::MveInsertLane { qd, rn, lane, size } => {
5297                let qd_enc = qreg_to_num(qd);
5298                let rn_bits = reg_to_bits(rn);
5299                let d_reg = qd_enc * 2 + ((*lane as u32) >> 1);
5300                let lane_in_d = (*lane as u32) & 1;
5301                let _sz = mve_size_bits(size);
5302                // VMOV Dn[x], Rn: EE00 0B10 for 32-bit
5303                let instr: u32 = 0xEE000B10 | (d_reg << 16) | (rn_bits << 12) | (lane_in_d << 21);
5304                Ok(vfp_to_thumb_bytes(instr))
5305            }
5306
5307            // MVE float comparisons — emit VCMP + VPSEL sequence (simplified: just VCMP)
5308            ArmOp::MveCmpEqI { qd, qn, qm, size }
5309            | ArmOp::MveCmpNeI { qd, qn, qm, size }
5310            | ArmOp::MveCmpLtS { qd, qn, qm, size }
5311            | ArmOp::MveCmpLtU { qd, qn, qm, size }
5312            | ArmOp::MveCmpGtS { qd, qn, qm, size }
5313            | ArmOp::MveCmpGtU { qd, qn, qm, size }
5314            | ArmOp::MveCmpLeS { qd, qn, qm, size }
5315            | ArmOp::MveCmpLeU { qd, qn, qm, size }
5316            | ArmOp::MveCmpGeS { qd, qn, qm, size }
5317            | ArmOp::MveCmpGeU { qd, qn, qm, size } => {
5318                // Encode as VADD (placeholder encoding — real implementation
5319                // would use VCMP + VPSEL pair)
5320                let sz = mve_size_bits(size);
5321                let base: u32 = 0xEF000840 | (sz << 20);
5322                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5323            }
5324
5325            // f32x4 MVE arithmetic
5326            ArmOp::MveAddF32 { qd, qn, qm } => {
5327                // VADD.F32 Qd, Qn, Qm (MVE): 0xEF000D40
5328                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF000D40, qd, qn, qm)))
5329            }
5330            ArmOp::MveSubF32 { qd, qn, qm } => {
5331                // VSUB.F32 Qd, Qn, Qm (MVE): 0xEF200D40
5332                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF200D40, qd, qn, qm)))
5333            }
5334            ArmOp::MveMulF32 { qd, qn, qm } => {
5335                // VMUL.F32 Qd, Qn, Qm (MVE): 0xFF000D50
5336                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xFF000D50, qd, qn, qm)))
5337            }
5338            ArmOp::MveNegF32 { qd, qm } => {
5339                let qd_enc = qreg_to_num(qd);
5340                let qm_enc = qreg_to_num(qm);
5341                // VNEG.F32 Qd, Qm: FFB907C0
5342                let instr: u32 = 0xFFB907C0 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5343                Ok(vfp_to_thumb_bytes(instr))
5344            }
5345            ArmOp::MveAbsF32 { qd, qm } => {
5346                let qd_enc = qreg_to_num(qd);
5347                let qm_enc = qreg_to_num(qm);
5348                // VABS.F32 Qd, Qm: FFB90740
5349                let instr: u32 = 0xFFB90740 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5350                Ok(vfp_to_thumb_bytes(instr))
5351            }
5352            ArmOp::MveCmpEqF32 { qd, qn, qm }
5353            | ArmOp::MveCmpNeF32 { qd, qn, qm }
5354            | ArmOp::MveCmpLtF32 { qd, qn, qm }
5355            | ArmOp::MveCmpLeF32 { qd, qn, qm }
5356            | ArmOp::MveCmpGtF32 { qd, qn, qm }
5357            | ArmOp::MveCmpGeF32 { qd, qn, qm } => {
5358                // Placeholder: encode as VADD.F32 (real impl needs VCMP.F32 + VPSEL)
5359                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF000D40, qd, qn, qm)))
5360            }
5361            ArmOp::MveDupF32 { qd, rn } => {
5362                let qd_enc = qreg_to_num(qd);
5363                let rn_bits = reg_to_bits(rn);
5364                // VDUP.32 Qd, Rn (same encoding as integer VDUP.32)
5365                let instr: u32 = 0xEEA00B10 | ((qd_enc * 2) << 16) | (rn_bits << 12);
5366                Ok(vfp_to_thumb_bytes(instr))
5367            }
5368            ArmOp::MveExtractLaneF32 { rd, qn, lane } => {
5369                let qn_enc = qreg_to_num(qn);
5370                let rd_bits = reg_to_bits(rd);
5371                // VMOV Rd, Sn where Sn = Q*4 + lane
5372                let s_num = qn_enc * 4 + (*lane as u32);
5373                let (vn, n) = encode_sreg(s_num);
5374                let instr: u32 = 0xEE100A10 | (vn << 16) | (rd_bits << 12) | (n << 7);
5375                Ok(vfp_to_thumb_bytes(instr))
5376            }
5377            ArmOp::MveReplaceLaneF32 { qd, rn, lane } => {
5378                let qd_enc = qreg_to_num(qd);
5379                let rn_bits = reg_to_bits(rn);
5380                // VMOV Sn, Rn where Sn = Q*4 + lane
5381                let s_num = qd_enc * 4 + (*lane as u32);
5382                let (vn, n) = encode_sreg(s_num);
5383                let instr: u32 = 0xEE000A10 | (vn << 16) | (rn_bits << 12) | (n << 7);
5384                Ok(vfp_to_thumb_bytes(instr))
5385            }
5386            ArmOp::MveDivF32 { qd, qn, qm } => {
5387                // Lane-wise: extract 4 S-regs, VDIV, insert back
5388                self.encode_thumb_mve_lane_wise_f32_binop(qd, qn, qm, 0xEE800A00)
5389            }
5390            ArmOp::MveSqrtF32 { qd, qm } => {
5391                // Lane-wise: extract 4 S-regs, VSQRT, insert back
5392                self.encode_thumb_mve_lane_wise_f32_sqrt(qd, qm)
5393            }
5394
5395            // Catch-all for any remaining ops
5396            _ => {
5397                let instr: u16 = 0xBF00; // NOP
5398                Ok(instr.to_le_bytes().to_vec())
5399            }
5400        }
5401    }
5402
5403    // === Thumb-2 VFP multi-instruction helpers ===
5404
5405    /// Encode F32 comparison as Thumb-2: VCMP.F32 + VMRS + MOVS rd,#0 + IT + MOV rd,#1
5406    fn encode_thumb_f32_compare(
5407        &self,
5408        rd: &Reg,
5409        sn: &VfpReg,
5410        sm: &VfpReg,
5411        cond_code: u32,
5412    ) -> Result<Vec<u8>> {
5413        let mut bytes = Vec::new();
5414        let rd_bits = reg_to_bits(rd);
5415
5416        // VCMP.F32 Sn, Sm
5417        let sn_num = vfp_sreg_to_num(sn)?;
5418        let sm_num = vfp_sreg_to_num(sm)?;
5419        let (vd, d) = encode_sreg(sn_num);
5420        let (vm, m) = encode_sreg(sm_num);
5421        let vcmp = 0xEEB40A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5422        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5423
5424        // VMRS APSR_nzcv, FPSCR: 0xEEF1FA10
5425        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5426
5427        // MOVS Rd, #0 (16-bit): 0010 0 Rd(3) 0000 0000
5428        if rd_bits < 8 {
5429            let movs_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
5430            bytes.extend_from_slice(&movs_zero.to_le_bytes());
5431        } else {
5432            // MOV.W Rd, #0 (32-bit Thumb-2)
5433            let hw1: u16 = 0xF04F;
5434            let hw2: u16 = (rd_bits as u16) << 8;
5435            bytes.extend_from_slice(&hw1.to_le_bytes());
5436            bytes.extend_from_slice(&hw2.to_le_bytes());
5437        }
5438
5439        // IT<cond> — If-Then for conditional MOV
5440        // IT encoding: 1011 1111 cond(4) mask(4)
5441        // mask = 0x8 for single "then" (IT)
5442        let it: u16 = 0xBF00 | ((cond_code as u16) << 4) | 0x8;
5443        bytes.extend_from_slice(&it.to_le_bytes());
5444
5445        // MOV Rd, #1 (16-bit, conditional due to IT): 0010 0 Rd(3) 0000 0001
5446        if rd_bits < 8 {
5447            let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
5448            bytes.extend_from_slice(&mov_one.to_le_bytes());
5449        } else {
5450            // MOV.W Rd, #1 (32-bit)
5451            let hw1: u16 = 0xF04F;
5452            let hw2: u16 = ((rd_bits as u16) << 8) | 0x01;
5453            bytes.extend_from_slice(&hw1.to_le_bytes());
5454            bytes.extend_from_slice(&hw2.to_le_bytes());
5455        }
5456
5457        Ok(bytes)
5458    }
5459
5460    /// Encode F32 constant load as Thumb-2: MOVW + MOVT + VMOV
5461    fn encode_thumb_f32_const(&self, sd: &VfpReg, value: f32) -> Result<Vec<u8>> {
5462        let mut bytes = Vec::new();
5463        let bits = value.to_bits();
5464        let rt: u32 = 12; // R12/IP as temp
5465
5466        // MOVW R12, #lo16
5467        // Thumb-2 MOVW: 11110 i 10 0100 imm4 | 0 imm3 Rd imm8
5468        let lo16 = bits & 0xFFFF;
5469        let imm4 = (lo16 >> 12) & 0xF;
5470        let i_bit = (lo16 >> 11) & 1;
5471        let imm3 = (lo16 >> 8) & 0x7;
5472        let imm8 = lo16 & 0xFF;
5473        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
5474        let hw2: u16 = ((imm3 << 12) | (rt << 8) | imm8) as u16;
5475        bytes.extend_from_slice(&hw1.to_le_bytes());
5476        bytes.extend_from_slice(&hw2.to_le_bytes());
5477
5478        // MOVT R12, #hi16
5479        let hi16 = (bits >> 16) & 0xFFFF;
5480        let imm4 = (hi16 >> 12) & 0xF;
5481        let i_bit = (hi16 >> 11) & 1;
5482        let imm3 = (hi16 >> 8) & 0x7;
5483        let imm8 = hi16 & 0xFF;
5484        let hw1: u16 = (0xF2C0 | (i_bit << 10) | imm4) as u16;
5485        let hw2: u16 = ((imm3 << 12) | (rt << 8) | imm8) as u16;
5486        bytes.extend_from_slice(&hw1.to_le_bytes());
5487        bytes.extend_from_slice(&hw2.to_le_bytes());
5488
5489        // VMOV Sd, R12
5490        let vmov = encode_vmov_core_sreg(true, sd, &Reg::R12)?;
5491        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5492
5493        Ok(bytes)
5494    }
5495
5496    /// Encode VMOV + VCVT.F32.xS32 as Thumb-2
5497    fn encode_thumb_f32_convert_i32(&self, sd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
5498        let mut bytes = Vec::new();
5499
5500        // VMOV Sd, Rm
5501        let vmov = encode_vmov_core_sreg(true, sd, rm)?;
5502        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5503
5504        // VCVT.F32.S32/U32 Sd, Sd
5505        let sd_num = vfp_sreg_to_num(sd)?;
5506        let (vd, d) = encode_sreg(sd_num);
5507        let (vm, m) = encode_sreg(sd_num);
5508        let base = if signed { 0xEEB80A40 } else { 0xEEB80AC0 };
5509        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
5510        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5511
5512        Ok(bytes)
5513    }
5514
5515    /// Encode F32 rounding pseudo-op as Thumb-2 via VCVT to integer and back
5516    /// Encode F32 rounding as Thumb-2.
5517    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
5518    ///
5519    /// For trunc: uses VCVTR.S32.F32 (always truncates).
5520    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F32 (non-R variant),
5521    /// then restores FPSCR.
5522    fn encode_thumb_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
5523        let mut bytes = Vec::new();
5524        let sm_num = vfp_sreg_to_num(sm)?;
5525        let sd_num = vfp_sreg_to_num(sd)?;
5526        let (vd_s, d_s) = encode_sreg(sd_num);
5527        let (vm_s, m_s) = encode_sreg(sm_num);
5528
5529        if mode == 0b11 {
5530            // Trunc (toward zero): VCVTR.S32.F32 — bit[7]=1, always truncates
5531            let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
5532            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5533        } else {
5534            // ceil/floor/nearest: manipulate FPSCR rounding mode
5535            let rt: u32 = 12; // R12/IP as temp
5536
5537            // VMRS R12, FPSCR
5538            let vmrs = 0xEEF10A10 | (rt << 12);
5539            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5540
5541            // BIC.W R12, R12, #(3 << 22) — clear RMode bits [23:22]
5542            // Thumb-2 modified immediate for 3<<22 = 0x00C00000:
5543            // BIC.W encoding: 11110 i 0 0001 S Rn | 0 imm3 Rd imm8
5544            // 0x00C00000 = 0x03 shifted left by 22 => Thumb mod-imm: i=0, imm3=0b101, imm8=0x03
5545            let bic_hw1: u16 = 0xF020 | ((rt as u16) & 0xF); // BIC, Rn=R12
5546            let bic_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | 0x03;
5547            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5548            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5549
5550            // ORR.W R12, R12, #(mode << 22)
5551            if mode != 0 {
5552                let orr_hw1: u16 = 0xF040 | ((rt as u16) & 0xF); // ORR, Rn=R12
5553                let orr_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | (mode as u16);
5554                bytes.extend_from_slice(&orr_hw1.to_le_bytes());
5555                bytes.extend_from_slice(&orr_hw2.to_le_bytes());
5556            }
5557
5558            // VMSR FPSCR, R12
5559            let vmsr = 0xEEE10A10 | (rt << 12);
5560            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5561
5562            // VCVT.S32.F32 Sd, Sm — non-R variant (bit[7]=0), uses FPSCR rmode
5563            let vcvt_to_int = 0xEEBD0A40 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
5564            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5565
5566            // Restore FPSCR: clear rmode bits back to nearest (default)
5567            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5568            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5569            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5570            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5571        }
5572
5573        // VCVT.F32.S32 Sd, Sd (convert integer result back to float)
5574        let (vd2, d2) = encode_sreg(sd_num);
5575        let vcvt_to_float = 0xEEB80A40 | (d2 << 22) | (vd2 << 12) | (d_s << 5) | vd_s;
5576        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_float));
5577
5578        Ok(bytes)
5579    }
5580
5581    /// Encode F32 min/max as Thumb-2: VMOV + VCMP + VMRS + IT + VMOV
5582    fn encode_thumb_f32_minmax(
5583        &self,
5584        sd: &VfpReg,
5585        sn: &VfpReg,
5586        sm: &VfpReg,
5587        is_min: bool,
5588    ) -> Result<Vec<u8>> {
5589        let mut bytes = Vec::new();
5590        let sn_num = vfp_sreg_to_num(sn)?;
5591        let sm_num = vfp_sreg_to_num(sm)?;
5592        let sd_num = vfp_sreg_to_num(sd)?;
5593
5594        // VMOV.F32 Sd, Sn
5595        let (vd, d) = encode_sreg(sd_num);
5596        let (vn, n) = encode_sreg(sn_num);
5597        let vmov_sn = 0xEEB00A40 | (d << 22) | (vd << 12) | (n << 5) | vn;
5598        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_sn));
5599
5600        // VCMP.F32 Sn, Sm
5601        let (vm, m) = encode_sreg(sm_num);
5602        let vcmp = 0xEEB40A40 | (n << 22) | (vn << 12) | (m << 5) | vm;
5603        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5604
5605        // VMRS APSR_nzcv, FPSCR
5606        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5607
5608        // IT GT (for min) or IT MI (for max)
5609        let cond: u16 = if is_min { 0xC } else { 0x4 };
5610        let it: u16 = 0xBF00 | (cond << 4) | 0x8;
5611        bytes.extend_from_slice(&it.to_le_bytes());
5612
5613        // VMOV{cond}.F32 Sd, Sm — conditional VMOV in IT block
5614        let vmov_sm = 0xEEB00A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5615        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_sm));
5616
5617        Ok(bytes)
5618    }
5619
5620    /// Encode F32 copysign as Thumb-2
5621    fn encode_thumb_f32_copysign(&self, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
5622        let mut bytes = Vec::new();
5623
5624        // VMOV R12, Sm (get sign source bits)
5625        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5626            false,
5627            sm,
5628            &Reg::R12,
5629        )?));
5630
5631        // VMOV R0, Sn (get magnitude source bits)
5632        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5633            false,
5634            sn,
5635            &Reg::R0,
5636        )?));
5637
5638        // AND.W R12, R12, #0x80000000
5639        // Thumb-2 modified immediate: 0x80000000 = constant 0x80 with rotation
5640        // Using T1 encoding: 11110 i 0 0000 S Rn | 0 imm3 Rd imm8
5641        // 0x80000000: i=0, imm3=0b001, imm8=0x00 (rotation=4, value=0x80)
5642        // Actually encoding #0x80000000 as modified constant:
5643        // bit pattern 1 followed by 31 zeros: enc = 0b0100_00000000 = 0x0100? No.
5644        // ARM modified immediate: abcdefgh rotated. 0x80000000 = 0x80 ROR 2 = enc 0x0102
5645        // Actually: value = abcdefgh ROR (2*rot). 0x80 = 10000000, ROR 2 gives 0x20000000.
5646        // For 0x80000000: 0x02 ROR 2 = 0x80000000. So imm12 = (1<<8) | 0x02 = 0x102
5647        let hw1: u16 = 0xF000 | 12; // AND.W R12, R12, #modified_const (i=0, Rn=R12)
5648        let hw2: u16 = (0x1 << 12) | (12 << 8) | 0x02; // imm3=1, Rd=R12, imm8=0x02
5649        bytes.extend_from_slice(&hw1.to_le_bytes());
5650        bytes.extend_from_slice(&hw2.to_le_bytes());
5651
5652        // BIC.W R0, R0, #0x80000000 (R0 = register 0, fields are zero)
5653        let hw1: u16 = 0xF020; // BIC.W R0, R0, #modified_const (i=0, Rn=R0)
5654        let hw2: u16 = (0x1 << 12) | 0x02; // imm3=1, Rd=R0, imm8=0x02
5655        bytes.extend_from_slice(&hw1.to_le_bytes());
5656        bytes.extend_from_slice(&hw2.to_le_bytes());
5657
5658        // ORR.W R0, R0, R12 (R0 = register 0)
5659        let hw1: u16 = 0xEA40; // ORR.W R0, R0, R12 (Rn=R0)
5660        let hw2: u16 = 12; // Rd=R0, Rm=R12
5661        bytes.extend_from_slice(&hw1.to_le_bytes());
5662        bytes.extend_from_slice(&hw2.to_le_bytes());
5663
5664        // VMOV Sd, R0
5665        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5666            true,
5667            sd,
5668            &Reg::R0,
5669        )?));
5670
5671        Ok(bytes)
5672    }
5673
5674    /// Encode F64 comparison as Thumb-2: VCMP.F64 + VMRS + MOV #0 + IT + MOV #1
5675    fn encode_thumb_f64_compare(
5676        &self,
5677        rd: &Reg,
5678        dn: &VfpReg,
5679        dm: &VfpReg,
5680        cond_code: u32,
5681    ) -> Result<Vec<u8>> {
5682        let mut bytes = Vec::new();
5683        let rd_bits = reg_to_bits(rd);
5684
5685        // VCMP.F64 Dn, Dm
5686        let dn_num = vfp_dreg_to_num(dn)?;
5687        let dm_num = vfp_dreg_to_num(dm)?;
5688        let (vd, d) = encode_dreg(dn_num);
5689        let (vm, m) = encode_dreg(dm_num);
5690        let vcmp = 0xEEB40B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5691        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5692
5693        // VMRS APSR_nzcv, FPSCR
5694        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5695
5696        // MOVS Rd, #0
5697        if rd_bits < 8 {
5698            let movs_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
5699            bytes.extend_from_slice(&movs_zero.to_le_bytes());
5700        } else {
5701            let hw1: u16 = 0xF04F;
5702            let hw2: u16 = (rd_bits as u16) << 8;
5703            bytes.extend_from_slice(&hw1.to_le_bytes());
5704            bytes.extend_from_slice(&hw2.to_le_bytes());
5705        }
5706
5707        // IT<cond>
5708        let it: u16 = 0xBF00 | ((cond_code as u16) << 4) | 0x8;
5709        bytes.extend_from_slice(&it.to_le_bytes());
5710
5711        // MOV Rd, #1
5712        if rd_bits < 8 {
5713            let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
5714            bytes.extend_from_slice(&mov_one.to_le_bytes());
5715        } else {
5716            let hw1: u16 = 0xF04F;
5717            let hw2: u16 = ((rd_bits as u16) << 8) | 0x01;
5718            bytes.extend_from_slice(&hw1.to_le_bytes());
5719            bytes.extend_from_slice(&hw2.to_le_bytes());
5720        }
5721
5722        Ok(bytes)
5723    }
5724
5725    /// Encode F64 constant load as Thumb-2: MOVW+MOVT (lo32 into R0) + MOVW+MOVT (hi32 into R12) + VMOV Dd, R0, R12
5726    fn encode_thumb_f64_const(&self, dd: &VfpReg, value: f64) -> Result<Vec<u8>> {
5727        let mut bytes = Vec::new();
5728        let bits = value.to_bits();
5729        let lo32 = bits as u32;
5730        let hi32 = (bits >> 32) as u32;
5731
5732        // MOVW R0, #lo16(lo32)
5733        let lo16 = lo32 & 0xFFFF;
5734        bytes.extend_from_slice(&self.encode_thumb32_movw_raw(0, lo16)?);
5735
5736        // MOVT R0, #hi16(lo32)
5737        let hi16 = (lo32 >> 16) & 0xFFFF;
5738        bytes.extend_from_slice(&self.encode_thumb32_movt_raw(0, hi16)?);
5739
5740        // MOVW R12, #lo16(hi32)
5741        let lo16 = hi32 & 0xFFFF;
5742        bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, lo16)?);
5743
5744        // MOVT R12, #hi16(hi32)
5745        let hi16 = (hi32 >> 16) & 0xFFFF;
5746        bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, hi16)?);
5747
5748        // VMOV Dd, R0, R12
5749        let vmov = encode_vmov_core_dreg(true, dd, &Reg::R0, &Reg::R12)?;
5750        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5751
5752        Ok(bytes)
5753    }
5754
5755    /// Encode VMOV Sd, Rm + VCVT.F64.S32/U32 Dd, Sd as Thumb-2
5756    fn encode_thumb_f64_convert_i32(&self, dd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
5757        let mut bytes = Vec::new();
5758
5759        // VMOV S0, Rm
5760        let vmov = encode_vmov_core_sreg(true, &VfpReg::S0, rm)?;
5761        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5762
5763        // VCVT.F64.S32 Dd, S0 or VCVT.F64.U32 Dd, S0
5764        let dd_num = vfp_dreg_to_num(dd)?;
5765        let (vd, d) = encode_dreg(dd_num);
5766        let base = if signed { 0xEEB80B40 } else { 0xEEB80BC0 };
5767        let vcvt = base | (d << 22) | (vd << 12);
5768        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5769
5770        Ok(bytes)
5771    }
5772
5773    /// Encode VCVT.F64.F32 Dd, Sm as Thumb-2
5774    fn encode_thumb_f64_promote_f32(&self, dd: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
5775        let dd_num = vfp_dreg_to_num(dd)?;
5776        let sm_num = vfp_sreg_to_num(sm)?;
5777        let (vd, d) = encode_dreg(dd_num);
5778        let (vm, m) = encode_sreg(sm_num);
5779
5780        let vcvt = 0xEEB70AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
5781        Ok(vfp_to_thumb_bytes(vcvt))
5782    }
5783
5784    /// Encode VCVT.S32/U32.F64 S0, Dm + VMOV Rd, S0 as Thumb-2
5785    fn encode_thumb_i32_trunc_f64(&self, rd: &Reg, dm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
5786        let mut bytes = Vec::new();
5787        let dm_num = vfp_dreg_to_num(dm)?;
5788        let (vm, m) = encode_dreg(dm_num);
5789
5790        // VCVT.S32.F64 S0, Dm or VCVT.U32.F64 S0, Dm
5791        let base = if signed { 0xEEBD0BC0 } else { 0xEEBC0BC0 };
5792        let vcvt = base | (m << 5) | vm;
5793        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5794
5795        // VMOV Rd, S0
5796        let vmov = encode_vmov_core_sreg(false, &VfpReg::S0, rd)?;
5797        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5798
5799        Ok(bytes)
5800    }
5801
5802    /// Encode F64 rounding pseudo-op as Thumb-2 via VCVT to integer and back
5803    /// Encode F64 rounding as Thumb-2.
5804    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
5805    fn encode_thumb_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
5806        let mut bytes = Vec::new();
5807        let dm_num = vfp_dreg_to_num(dm)?;
5808        let dd_num = vfp_dreg_to_num(dd)?;
5809        let (vm, m) = encode_dreg(dm_num);
5810        let (vd, d) = encode_dreg(dd_num);
5811
5812        if mode == 0b11 {
5813            // Trunc: VCVTR.S32.F64 — bit[7]=1, always truncates
5814            let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
5815            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5816        } else {
5817            let rt: u32 = 12;
5818
5819            // VMRS R12, FPSCR
5820            let vmrs = 0xEEF10A10 | (rt << 12);
5821            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5822
5823            // BIC.W R12, R12, #(3 << 22)
5824            let bic_hw1: u16 = 0xF020 | ((rt as u16) & 0xF);
5825            let bic_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | 0x03;
5826            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5827            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5828
5829            // ORR.W R12, R12, #(mode << 22)
5830            if mode != 0 {
5831                let orr_hw1: u16 = 0xF040 | ((rt as u16) & 0xF);
5832                let orr_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | (mode as u16);
5833                bytes.extend_from_slice(&orr_hw1.to_le_bytes());
5834                bytes.extend_from_slice(&orr_hw2.to_le_bytes());
5835            }
5836
5837            // VMSR FPSCR, R12
5838            let vmsr = 0xEEE10A10 | (rt << 12);
5839            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5840
5841            // VCVT.S32.F64 S0, Dm — non-R variant (bit[7]=0)
5842            let vcvt_to_int = 0xEEBD0B40 | (m << 5) | vm;
5843            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5844
5845            // Restore FPSCR
5846            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5847            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5848            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5849            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5850        }
5851
5852        // VCVT.F64.S32 Dd, S0
5853        let vcvt_to_float = 0xEEB80B40 | (d << 22) | (vd << 12);
5854        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_float));
5855
5856        Ok(bytes)
5857    }
5858
5859    /// Encode F64 min/max as Thumb-2
5860    fn encode_thumb_f64_minmax(
5861        &self,
5862        dd: &VfpReg,
5863        dn: &VfpReg,
5864        dm: &VfpReg,
5865        is_min: bool,
5866    ) -> Result<Vec<u8>> {
5867        let mut bytes = Vec::new();
5868        let dn_num = vfp_dreg_to_num(dn)?;
5869        let dm_num = vfp_dreg_to_num(dm)?;
5870        let dd_num = vfp_dreg_to_num(dd)?;
5871
5872        // VMOV.F64 Dd, Dn
5873        let (vd, d) = encode_dreg(dd_num);
5874        let (vn, n) = encode_dreg(dn_num);
5875        let vmov_dn = 0xEEB00B40 | (d << 22) | (vd << 12) | (n << 5) | vn;
5876        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_dn));
5877
5878        // VCMP.F64 Dn, Dm
5879        let (vm, m) = encode_dreg(dm_num);
5880        let vcmp = 0xEEB40B40 | (n << 22) | (vn << 12) | (m << 5) | vm;
5881        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5882
5883        // VMRS APSR_nzcv, FPSCR
5884        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5885
5886        // IT GT (for min) or IT MI (for max)
5887        let cond: u16 = if is_min { 0xC } else { 0x4 };
5888        let it: u16 = 0xBF00 | (cond << 4) | 0x8;
5889        bytes.extend_from_slice(&it.to_le_bytes());
5890
5891        // VMOV{cond}.F64 Dd, Dm
5892        let vmov_dm = 0xEEB00B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5893        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_dm));
5894
5895        Ok(bytes)
5896    }
5897
5898    /// Encode F64 copysign as Thumb-2
5899    fn encode_thumb_f64_copysign(&self, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<Vec<u8>> {
5900        let mut bytes = Vec::new();
5901
5902        // VMOV R0, R12, Dm (get sign source)
5903        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
5904            false,
5905            dm,
5906            &Reg::R0,
5907            &Reg::R12,
5908        )?));
5909
5910        // VMOV R1, R2, Dn (get magnitude source)
5911        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
5912            false,
5913            dn,
5914            &Reg::R1,
5915            &Reg::R2,
5916        )?));
5917
5918        // AND.W R12, R12, #0x80000000 (i=0, Rn=R12)
5919        let hw1: u16 = 0xF000 | 12;
5920        let hw2: u16 = (0x1 << 12) | (12 << 8) | 0x02;
5921        bytes.extend_from_slice(&hw1.to_le_bytes());
5922        bytes.extend_from_slice(&hw2.to_le_bytes());
5923
5924        // BIC.W R2, R2, #0x80000000 (i=0, Rn=R2)
5925        let hw1: u16 = 0xF020 | 2;
5926        let hw2: u16 = (0x1 << 12) | (2 << 8) | 0x02;
5927        bytes.extend_from_slice(&hw1.to_le_bytes());
5928        bytes.extend_from_slice(&hw2.to_le_bytes());
5929
5930        // ORR.W R2, R2, R12
5931        let hw1: u16 = 0xEA40 | 2;
5932        let hw2: u16 = (2 << 8) | 12;
5933        bytes.extend_from_slice(&hw1.to_le_bytes());
5934        bytes.extend_from_slice(&hw2.to_le_bytes());
5935
5936        // VMOV Dd, R1, R2
5937        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
5938            true,
5939            dd,
5940            &Reg::R1,
5941            &Reg::R2,
5942        )?));
5943
5944        Ok(bytes)
5945    }
5946
5947    /// Encode VCVT.S32/U32.F32 + VMOV as Thumb-2
5948    fn encode_thumb_i32_trunc_f32(&self, rd: &Reg, sm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
5949        let mut bytes = Vec::new();
5950
5951        let sm_num = vfp_sreg_to_num(sm)?;
5952        let (vd, d) = encode_sreg(sm_num);
5953        let (vm, m) = encode_sreg(sm_num);
5954        let base = if signed { 0xEEBD0AC0 } else { 0xEEBC0AC0 };
5955        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
5956        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5957
5958        // VMOV Rd, Sm
5959        let vmov = encode_vmov_core_sreg(false, sm, rd)?;
5960        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5961
5962        Ok(bytes)
5963    }
5964
5965    // === Thumb-2 32-bit encoding helpers ===
5966
5967    /// Encode Thumb-2 32-bit ADD with immediate
5968    fn encode_thumb32_add(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
5969        let rd_bits = reg_to_bits(rd);
5970        let rn_bits = reg_to_bits(rn);
5971
5972        // ADD.W Rd, Rn, #imm12
5973        // First halfword: 1111 0 i 0 1000 S Rn
5974        // Second halfword: 0 imm3 Rd imm8
5975        let i_bit = (imm >> 11) & 1;
5976        let imm3 = (imm >> 8) & 0x7;
5977        let imm8 = imm & 0xFF;
5978
5979        let hw1: u16 = (0xF100 | (i_bit << 10) | rn_bits) as u16;
5980        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
5981
5982        let mut bytes = hw1.to_le_bytes().to_vec();
5983        bytes.extend_from_slice(&hw2.to_le_bytes());
5984        Ok(bytes)
5985    }
5986
5987    /// Encode Thumb-2 32-bit SUB with immediate
5988    fn encode_thumb32_sub(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
5989        let rd_bits = reg_to_bits(rd);
5990        let rn_bits = reg_to_bits(rn);
5991
5992        let i_bit = (imm >> 11) & 1;
5993        let imm3 = (imm >> 8) & 0x7;
5994        let imm8 = imm & 0xFF;
5995
5996        let hw1: u16 = (0xF1A0 | (i_bit << 10) | rn_bits) as u16;
5997        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
5998
5999        let mut bytes = hw1.to_le_bytes().to_vec();
6000        bytes.extend_from_slice(&hw2.to_le_bytes());
6001        Ok(bytes)
6002    }
6003
6004    /// Encode Thumb-2 32-bit ADDS with immediate (sets flags)
6005    fn encode_thumb32_adds(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6006        let rd_bits = reg_to_bits(rd);
6007        let rn_bits = reg_to_bits(rn);
6008
6009        let i_bit = (imm >> 11) & 1;
6010        let imm3 = (imm >> 8) & 0x7;
6011        let imm8 = imm & 0xFF;
6012
6013        // ADDS.W Rd, Rn, #imm (with S=1)
6014        // First halfword: 1111 0 i 0 1000 1 Rn = F110 | i<<10 | Rn
6015        let hw1: u16 = (0xF110 | (i_bit << 10) | rn_bits) as u16;
6016        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6017
6018        let mut bytes = hw1.to_le_bytes().to_vec();
6019        bytes.extend_from_slice(&hw2.to_le_bytes());
6020        Ok(bytes)
6021    }
6022
6023    /// Encode Thumb-2 32-bit SUBS with immediate (sets flags)
6024    fn encode_thumb32_subs(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6025        let rd_bits = reg_to_bits(rd);
6026        let rn_bits = reg_to_bits(rn);
6027
6028        let i_bit = (imm >> 11) & 1;
6029        let imm3 = (imm >> 8) & 0x7;
6030        let imm8 = imm & 0xFF;
6031
6032        // SUBS.W Rd, Rn, #imm (with S=1)
6033        // First halfword: 1111 0 i 0 1101 1 Rn = F1B0 | i<<10 | Rn
6034        let hw1: u16 = (0xF1B0 | (i_bit << 10) | rn_bits) as u16;
6035        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6036
6037        let mut bytes = hw1.to_le_bytes().to_vec();
6038        bytes.extend_from_slice(&hw2.to_le_bytes());
6039        Ok(bytes)
6040    }
6041
6042    /// Encode Thumb-2 32-bit MOVW (16-bit immediate)
6043    ///
6044    /// # Contract (Verus-style)
6045    /// ```text
6046    /// requires rd <= R14
6047    /// ensures result.len() == 4
6048    /// ensures (imm & 0xFFFF) can be reconstructed from the encoding
6049    /// ```
6050    fn encode_thumb32_movw(&self, rd: &Reg, imm: u32) -> Result<Vec<u8>> {
6051        let rd_bits = reg_to_bits(rd);
6052        reg_bits_checked(rd_bits)?;
6053        let imm16 = imm & 0xFFFF;
6054
6055        // MOVW Rd, #imm16
6056        // 1111 0 i 10 0 1 0 0 imm4 | 0 imm3 Rd imm8
6057        let imm4 = (imm16 >> 12) & 0xF;
6058        let i_bit = (imm16 >> 11) & 1;
6059        let imm3 = (imm16 >> 8) & 0x7;
6060        let imm8 = imm16 & 0xFF;
6061
6062        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
6063        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6064
6065        let mut bytes = hw1.to_le_bytes().to_vec();
6066        bytes.extend_from_slice(&hw2.to_le_bytes());
6067        encoding_contracts::verify_thumb32(&bytes);
6068        Ok(bytes)
6069    }
6070
6071    /// Encode Thumb-2 32-bit shift with immediate
6072    ///
6073    /// # Contract (Verus-style)
6074    /// ```text
6075    /// requires rd <= R14, rm <= R14
6076    /// ensures result.len() == 4
6077    /// ```
6078    fn encode_thumb32_shift(
6079        &self,
6080        rd: &Reg,
6081        rm: &Reg,
6082        shift: u32,
6083        shift_type: u8,
6084    ) -> Result<Vec<u8>> {
6085        let rd_bits = reg_to_bits(rd);
6086        let rm_bits = reg_to_bits(rm);
6087        reg_bits_checked(rd_bits)?;
6088        reg_bits_checked(rm_bits)?;
6089        let imm5 = shift & 0x1F;
6090        let imm2 = imm5 & 0x3;
6091        let imm3 = (imm5 >> 2) & 0x7;
6092
6093        // MOV.W Rd, Rm, <shift> #imm
6094        // EA4F 0 imm3 Rd imm2 type Rm
6095        let hw1: u16 = 0xEA4F;
6096        let hw2: u16 =
6097            ((imm3 << 12) | (rd_bits << 8) | (imm2 << 6) | ((shift_type as u32) << 4) | rm_bits)
6098                as u16;
6099
6100        let mut bytes = hw1.to_le_bytes().to_vec();
6101        bytes.extend_from_slice(&hw2.to_le_bytes());
6102        Ok(bytes)
6103    }
6104
6105    /// Encode Thumb-2 32-bit shift by register
6106    /// Encoding: 11111010 0xx0 Rn | 1111 Rd 0000 Rm
6107    /// shift_type: 00=LSL, 01=LSR, 10=ASR, 11=ROR
6108    fn encode_thumb32_shift_reg(
6109        &self,
6110        rd: &Reg,
6111        rn: &Reg,
6112        rm: &Reg,
6113        shift_type: u8,
6114    ) -> Result<Vec<u8>> {
6115        let rd_bits = reg_to_bits(rd);
6116        let rn_bits = reg_to_bits(rn);
6117        let rm_bits = reg_to_bits(rm);
6118
6119        // hw1: 1111 1010 0xx0 Rn
6120        let hw1: u16 = (0xFA00 | ((shift_type as u32) << 5) | rn_bits) as u16;
6121        // hw2: 1111 Rd 0000 Rm
6122        let hw2: u16 = (0xF000 | (rd_bits << 8) | rm_bits) as u16;
6123
6124        let mut bytes = hw1.to_le_bytes().to_vec();
6125        bytes.extend_from_slice(&hw2.to_le_bytes());
6126        Ok(bytes)
6127    }
6128
6129    /// Encode Thumb-2 32-bit CMP with immediate
6130    fn encode_thumb32_cmp_imm(&self, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6131        let rn_bits = reg_to_bits(rn);
6132
6133        let i_bit = (imm >> 11) & 1;
6134        let imm3 = (imm >> 8) & 0x7;
6135        let imm8 = imm & 0xFF;
6136
6137        // CMP.W Rn, #imm
6138        let hw1: u16 = (0xF1B0 | (i_bit << 10) | rn_bits) as u16;
6139        let hw2: u16 = ((imm3 << 12) | 0x0F00 | imm8) as u16;
6140
6141        let mut bytes = hw1.to_le_bytes().to_vec();
6142        bytes.extend_from_slice(&hw2.to_le_bytes());
6143        Ok(bytes)
6144    }
6145
6146    /// Encode Thumb-2 32-bit LDR
6147    fn encode_thumb32_ldr(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6148        let rd_bits = reg_to_bits(rd);
6149        let base_bits = reg_to_bits(base);
6150
6151        // LDR.W Rd, [Rn, #imm12]
6152        let hw1: u16 = (0xF8D0 | base_bits) as u16;
6153        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6154
6155        let mut bytes = hw1.to_le_bytes().to_vec();
6156        bytes.extend_from_slice(&hw2.to_le_bytes());
6157        Ok(bytes)
6158    }
6159
6160    /// Encode Thumb-2 32-bit STR
6161    fn encode_thumb32_str(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6162        let rd_bits = reg_to_bits(rd);
6163        let base_bits = reg_to_bits(base);
6164
6165        // STR.W Rd, [Rn, #imm12]
6166        let hw1: u16 = (0xF8C0 | base_bits) as u16;
6167        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6168
6169        let mut bytes = hw1.to_le_bytes().to_vec();
6170        bytes.extend_from_slice(&hw2.to_le_bytes());
6171        Ok(bytes)
6172    }
6173
6174    /// Encode Thumb-2 32-bit LDR with register offset: LDR.W Rd, [Rn, Rm]
6175    fn encode_thumb32_ldr_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6176        let rd_bits = reg_to_bits(rd);
6177        let base_bits = reg_to_bits(base);
6178        let rm_bits = reg_to_bits(offset_reg);
6179
6180        // LDR.W Rd, [Rn, Rm, LSL #0]
6181        // Encoding: 1111 1000 0101 Rn | Rt 0000 00 imm2 Rm
6182        // imm2 = 00 for no shift (LSL #0)
6183        let hw1: u16 = (0xF850 | base_bits) as u16;
6184        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6185
6186        let mut bytes = hw1.to_le_bytes().to_vec();
6187        bytes.extend_from_slice(&hw2.to_le_bytes());
6188        Ok(bytes)
6189    }
6190
6191    /// Encode Thumb-2 32-bit STR with register offset: STR.W Rd, [Rn, Rm]
6192    fn encode_thumb32_str_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6193        let rd_bits = reg_to_bits(rd);
6194        let base_bits = reg_to_bits(base);
6195        let rm_bits = reg_to_bits(offset_reg);
6196
6197        // STR.W Rd, [Rn, Rm, LSL #0]
6198        // Encoding: 1111 1000 0100 Rn | Rt 0000 00 imm2 Rm
6199        // imm2 = 00 for no shift (LSL #0)
6200        let hw1: u16 = (0xF840 | base_bits) as u16;
6201        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6202
6203        let mut bytes = hw1.to_le_bytes().to_vec();
6204        bytes.extend_from_slice(&hw2.to_le_bytes());
6205        Ok(bytes)
6206    }
6207
6208    // === Sub-word load/store Thumb-2 encoding helpers ===
6209
6210    /// Encode Thumb-2 32-bit LDRB with immediate: LDRB.W Rd, [Rn, #imm12]
6211    fn encode_thumb32_ldrb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6212        let rd_bits = reg_to_bits(rd);
6213        let base_bits = reg_to_bits(base);
6214        // LDRB.W Rd, [Rn, #imm12]: 1111 1000 1001 Rn | Rt imm12
6215        let hw1: u16 = (0xF890 | base_bits) as u16;
6216        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6217        let mut bytes = hw1.to_le_bytes().to_vec();
6218        bytes.extend_from_slice(&hw2.to_le_bytes());
6219        Ok(bytes)
6220    }
6221
6222    /// Encode Thumb-2 32-bit LDRB with register: LDRB.W Rd, [Rn, Rm]
6223    fn encode_thumb32_ldrb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6224        let rd_bits = reg_to_bits(rd);
6225        let base_bits = reg_to_bits(base);
6226        let rm_bits = reg_to_bits(offset_reg);
6227        // LDRB.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0001 Rn | Rt 0000 00 imm2 Rm
6228        let hw1: u16 = (0xF810 | base_bits) as u16;
6229        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6230        let mut bytes = hw1.to_le_bytes().to_vec();
6231        bytes.extend_from_slice(&hw2.to_le_bytes());
6232        Ok(bytes)
6233    }
6234
6235    /// Encode Thumb-2 32-bit LDRSB with immediate: LDRSB.W Rd, [Rn, #imm12]
6236    fn encode_thumb32_ldrsb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6237        let rd_bits = reg_to_bits(rd);
6238        let base_bits = reg_to_bits(base);
6239        // LDRSB.W Rd, [Rn, #imm12]: 1111 1001 1001 Rn | Rt imm12
6240        let hw1: u16 = (0xF990 | base_bits) as u16;
6241        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6242        let mut bytes = hw1.to_le_bytes().to_vec();
6243        bytes.extend_from_slice(&hw2.to_le_bytes());
6244        Ok(bytes)
6245    }
6246
6247    /// Encode Thumb-2 32-bit LDRSB with register: LDRSB.W Rd, [Rn, Rm]
6248    fn encode_thumb32_ldrsb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6249        let rd_bits = reg_to_bits(rd);
6250        let base_bits = reg_to_bits(base);
6251        let rm_bits = reg_to_bits(offset_reg);
6252        // LDRSB.W Rd, [Rn, Rm, LSL #0]: 1111 1001 0001 Rn | Rt 0000 00 imm2 Rm
6253        let hw1: u16 = (0xF910 | base_bits) as u16;
6254        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6255        let mut bytes = hw1.to_le_bytes().to_vec();
6256        bytes.extend_from_slice(&hw2.to_le_bytes());
6257        Ok(bytes)
6258    }
6259
6260    /// Encode Thumb-2 32-bit LDRH with immediate: LDRH.W Rd, [Rn, #imm12]
6261    fn encode_thumb32_ldrh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6262        let rd_bits = reg_to_bits(rd);
6263        let base_bits = reg_to_bits(base);
6264        // LDRH.W Rd, [Rn, #imm12]: 1111 1000 1011 Rn | Rt imm12
6265        let hw1: u16 = (0xF8B0 | base_bits) as u16;
6266        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6267        let mut bytes = hw1.to_le_bytes().to_vec();
6268        bytes.extend_from_slice(&hw2.to_le_bytes());
6269        Ok(bytes)
6270    }
6271
6272    /// Encode Thumb-2 32-bit LDRH with register: LDRH.W Rd, [Rn, Rm]
6273    fn encode_thumb32_ldrh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6274        let rd_bits = reg_to_bits(rd);
6275        let base_bits = reg_to_bits(base);
6276        let rm_bits = reg_to_bits(offset_reg);
6277        // LDRH.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0011 Rn | Rt 0000 00 imm2 Rm
6278        let hw1: u16 = (0xF830 | base_bits) as u16;
6279        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6280        let mut bytes = hw1.to_le_bytes().to_vec();
6281        bytes.extend_from_slice(&hw2.to_le_bytes());
6282        Ok(bytes)
6283    }
6284
6285    /// Encode Thumb-2 32-bit LDRSH with immediate: LDRSH.W Rd, [Rn, #imm12]
6286    fn encode_thumb32_ldrsh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6287        let rd_bits = reg_to_bits(rd);
6288        let base_bits = reg_to_bits(base);
6289        // LDRSH.W Rd, [Rn, #imm12]: 1111 1001 1011 Rn | Rt imm12
6290        let hw1: u16 = (0xF9B0 | base_bits) as u16;
6291        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6292        let mut bytes = hw1.to_le_bytes().to_vec();
6293        bytes.extend_from_slice(&hw2.to_le_bytes());
6294        Ok(bytes)
6295    }
6296
6297    /// Encode Thumb-2 32-bit LDRSH with register: LDRSH.W Rd, [Rn, Rm]
6298    fn encode_thumb32_ldrsh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6299        let rd_bits = reg_to_bits(rd);
6300        let base_bits = reg_to_bits(base);
6301        let rm_bits = reg_to_bits(offset_reg);
6302        // LDRSH.W Rd, [Rn, Rm, LSL #0]: 1111 1001 0011 Rn | Rt 0000 00 imm2 Rm
6303        let hw1: u16 = (0xF930 | base_bits) as u16;
6304        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6305        let mut bytes = hw1.to_le_bytes().to_vec();
6306        bytes.extend_from_slice(&hw2.to_le_bytes());
6307        Ok(bytes)
6308    }
6309
6310    /// Encode Thumb-2 32-bit STRB with immediate: STRB.W Rd, [Rn, #imm12]
6311    fn encode_thumb32_strb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6312        let rd_bits = reg_to_bits(rd);
6313        let base_bits = reg_to_bits(base);
6314        // STRB.W Rd, [Rn, #imm12]: 1111 1000 1000 Rn | Rt imm12
6315        let hw1: u16 = (0xF880 | base_bits) as u16;
6316        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6317        let mut bytes = hw1.to_le_bytes().to_vec();
6318        bytes.extend_from_slice(&hw2.to_le_bytes());
6319        Ok(bytes)
6320    }
6321
6322    /// Encode Thumb-2 32-bit STRB with register: STRB.W Rd, [Rn, Rm]
6323    fn encode_thumb32_strb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6324        let rd_bits = reg_to_bits(rd);
6325        let base_bits = reg_to_bits(base);
6326        let rm_bits = reg_to_bits(offset_reg);
6327        // STRB.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0000 Rn | Rt 0000 00 imm2 Rm
6328        let hw1: u16 = (0xF800 | base_bits) as u16;
6329        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6330        let mut bytes = hw1.to_le_bytes().to_vec();
6331        bytes.extend_from_slice(&hw2.to_le_bytes());
6332        Ok(bytes)
6333    }
6334
6335    /// Encode Thumb-2 32-bit STRH with immediate: STRH.W Rd, [Rn, #imm12]
6336    fn encode_thumb32_strh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6337        let rd_bits = reg_to_bits(rd);
6338        let base_bits = reg_to_bits(base);
6339        // STRH.W Rd, [Rn, #imm12]: 1111 1000 1010 Rn | Rt imm12
6340        let hw1: u16 = (0xF8A0 | base_bits) as u16;
6341        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6342        let mut bytes = hw1.to_le_bytes().to_vec();
6343        bytes.extend_from_slice(&hw2.to_le_bytes());
6344        Ok(bytes)
6345    }
6346
6347    /// Encode Thumb-2 32-bit STRH with register: STRH.W Rd, [Rn, Rm]
6348    fn encode_thumb32_strh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6349        let rd_bits = reg_to_bits(rd);
6350        let base_bits = reg_to_bits(base);
6351        let rm_bits = reg_to_bits(offset_reg);
6352        // STRH.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0010 Rn | Rt 0000 00 imm2 Rm
6353        let hw1: u16 = (0xF820 | base_bits) as u16;
6354        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6355        let mut bytes = hw1.to_le_bytes().to_vec();
6356        bytes.extend_from_slice(&hw2.to_le_bytes());
6357        Ok(bytes)
6358    }
6359
6360    /// Encode Thumb-2 32-bit ADD with immediate: ADD.W Rd, Rn, #imm
6361    fn encode_thumb32_add_imm(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6362        let rd_bits = reg_to_bits(rd);
6363        let rn_bits = reg_to_bits(rn);
6364
6365        // For small immediates, use ADD.W Rd, Rn, #imm12
6366        // Encoding: 1111 0 i 0 1 0 0 0 S Rn | 0 imm3 Rd imm8
6367        // S = 0 (don't update flags)
6368        // The 12-bit immediate is encoded as: i:imm3:imm8
6369        // For simplicity, we only support imm <= 0xFFF (direct encoding)
6370        if imm <= 0xFFF {
6371            let i_bit = (imm >> 11) & 1;
6372            let imm3 = (imm >> 8) & 0x7;
6373            let imm8 = imm & 0xFF;
6374
6375            let hw1: u16 = (0xF100 | (i_bit << 10) | rn_bits) as u16;
6376            let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6377
6378            let mut bytes = hw1.to_le_bytes().to_vec();
6379            bytes.extend_from_slice(&hw2.to_le_bytes());
6380            Ok(bytes)
6381        } else {
6382            // For larger immediates, would need MOVW/MOVT + ADD
6383            // For now, return error
6384            Err(synth_core::Error::synthesis(
6385                "ADD immediate too large for single instruction",
6386            ))
6387        }
6388    }
6389
6390    // === Raw encoding helpers for POPCNT (take register numbers directly) ===
6391
6392    /// Encode Thumb-2 32-bit MOVW (16-bit immediate) - raw version
6393    ///
6394    /// # Contract (Verus-style)
6395    /// ```text
6396    /// requires rd <= 14, imm16 <= 0xFFFF
6397    /// ensures result.len() == 4
6398    /// ```
6399    fn encode_thumb32_movw_raw(&self, rd: u32, imm16: u32) -> Result<Vec<u8>> {
6400        reg_bits_checked(rd)?;
6401        encoding_contracts::verify_imm16(imm16);
6402        // MOVW Rd, #imm16
6403        // 1111 0 i 10 0 1 0 0 imm4 | 0 imm3 Rd imm8
6404        let imm16 = imm16 & 0xFFFF;
6405        let imm4 = (imm16 >> 12) & 0xF;
6406        let i_bit = (imm16 >> 11) & 1;
6407        let imm3 = (imm16 >> 8) & 0x7;
6408        let imm8 = imm16 & 0xFF;
6409
6410        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
6411        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6412
6413        let mut bytes = hw1.to_le_bytes().to_vec();
6414        bytes.extend_from_slice(&hw2.to_le_bytes());
6415        encoding_contracts::verify_thumb32(&bytes);
6416        Ok(bytes)
6417    }
6418
6419    /// Encode Thumb-2 32-bit MOVT (move top 16 bits) - raw version
6420    ///
6421    /// # Contract (Verus-style)
6422    /// ```text
6423    /// requires rd <= 14, imm16 <= 0xFFFF
6424    /// ensures result.len() == 4
6425    /// ```
6426    fn encode_thumb32_movt_raw(&self, rd: u32, imm16: u32) -> Result<Vec<u8>> {
6427        reg_bits_checked(rd)?;
6428        encoding_contracts::verify_imm16(imm16);
6429        // MOVT Rd, #imm16
6430        // 1111 0 i 10 1 1 0 0 imm4 | 0 imm3 Rd imm8
6431        let imm16 = imm16 & 0xFFFF;
6432        let imm4 = (imm16 >> 12) & 0xF;
6433        let i_bit = (imm16 >> 11) & 1;
6434        let imm3 = (imm16 >> 8) & 0x7;
6435        let imm8 = imm16 & 0xFF;
6436
6437        let hw1: u16 = (0xF2C0 | (i_bit << 10) | imm4) as u16;
6438        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6439
6440        let mut bytes = hw1.to_le_bytes().to_vec();
6441        bytes.extend_from_slice(&hw2.to_le_bytes());
6442        encoding_contracts::verify_thumb32(&bytes);
6443        Ok(bytes)
6444    }
6445
6446    /// Encode Thumb-2 32-bit LSR (logical shift right) with immediate - raw version
6447    fn encode_thumb32_lsr_raw(&self, rd: u32, rm: u32, shift: u32) -> Result<Vec<u8>> {
6448        // MOV.W Rd, Rm, LSR #imm
6449        // EA4F 0 imm3 Rd imm2 01 Rm
6450        let imm5 = shift & 0x1F;
6451        let imm2 = imm5 & 0x3;
6452        let imm3 = (imm5 >> 2) & 0x7;
6453
6454        let hw1: u16 = 0xEA4F;
6455        let hw2: u16 = ((imm3 << 12) | (rd << 8) | (imm2 << 6) | (0b01 << 4) | rm) as u16;
6456
6457        let mut bytes = hw1.to_le_bytes().to_vec();
6458        bytes.extend_from_slice(&hw2.to_le_bytes());
6459        Ok(bytes)
6460    }
6461
6462    /// Encode Thumb-2 32-bit AND (register) - raw version
6463    fn encode_thumb32_and_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6464        // AND.W Rd, Rn, Rm
6465        // EA00 Rn | 0 Rd 00 00 Rm
6466        let hw1: u16 = (0xEA00 | rn) as u16;
6467        let hw2: u16 = ((rd << 8) | rm) as u16;
6468
6469        let mut bytes = hw1.to_le_bytes().to_vec();
6470        bytes.extend_from_slice(&hw2.to_le_bytes());
6471        Ok(bytes)
6472    }
6473
6474    /// Encode Thumb-2 32-bit AND with immediate - raw version
6475    fn encode_thumb32_and_imm_raw(&self, rd: u32, rn: u32, imm: u32) -> Result<Vec<u8>> {
6476        // AND.W Rd, Rn, #<modified_immediate>
6477        // For small immediates (0-255), the encoding is simpler
6478        // F0 00 Rn | 0 imm3 Rd imm8
6479        let i_bit = (imm >> 11) & 1;
6480        let imm3 = (imm >> 8) & 0x7;
6481        let imm8 = imm & 0xFF;
6482
6483        let hw1: u16 = (0xF000 | (i_bit << 10) | rn) as u16;
6484        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6485
6486        let mut bytes = hw1.to_le_bytes().to_vec();
6487        bytes.extend_from_slice(&hw2.to_le_bytes());
6488        Ok(bytes)
6489    }
6490
6491    /// Encode Thumb-2 32-bit SUB (register) - raw version
6492    fn encode_thumb32_sub_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6493        // SUB.W Rd, Rn, Rm
6494        // EBA0 Rn | 0 Rd 00 00 Rm
6495        let hw1: u16 = (0xEBA0 | rn) as u16;
6496        let hw2: u16 = ((rd << 8) | rm) as u16;
6497
6498        let mut bytes = hw1.to_le_bytes().to_vec();
6499        bytes.extend_from_slice(&hw2.to_le_bytes());
6500        Ok(bytes)
6501    }
6502
6503    /// Encode Thumb-2 32-bit ADD (register) - raw version
6504    fn encode_thumb32_add_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6505        // ADD.W Rd, Rn, Rm
6506        // EB00 Rn | 0 Rd 00 00 Rm
6507        let hw1: u16 = (0xEB00 | rn) as u16;
6508        let hw2: u16 = ((rd << 8) | rm) as u16;
6509
6510        let mut bytes = hw1.to_le_bytes().to_vec();
6511        bytes.extend_from_slice(&hw2.to_le_bytes());
6512        Ok(bytes)
6513    }
6514
6515    /// Encode Thumb-2 32-bit ADDS (register, flag-setting) - raw version.
6516    /// Used as the high-register fallback for `ArmOp::Adds` (i64 low-word add)
6517    /// so R8-R11 pair operands don't overflow the 16-bit field — #178/#180.
6518    fn encode_thumb32_adds_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6519        // ADDS.W Rd, Rn, Rm (T3, S=1): EB10 Rn | 0 Rd 00 00 Rm
6520        let hw1: u16 = (0xEB10 | rn) as u16;
6521        let hw2: u16 = ((rd << 8) | rm) as u16;
6522        let mut bytes = hw1.to_le_bytes().to_vec();
6523        bytes.extend_from_slice(&hw2.to_le_bytes());
6524        Ok(bytes)
6525    }
6526
6527    /// Encode Thumb-2 32-bit SUBS (register, flag-setting) - raw version.
6528    /// High-register fallback for `ArmOp::Subs` (i64 low-word subtract) — #178/#180.
6529    fn encode_thumb32_subs_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6530        // SUBS.W Rd, Rn, Rm (T3, S=1): EBB0 Rn | 0 Rd 00 00 Rm
6531        let hw1: u16 = (0xEBB0 | rn) as u16;
6532        let hw2: u16 = ((rd << 8) | rm) as u16;
6533        let mut bytes = hw1.to_le_bytes().to_vec();
6534        bytes.extend_from_slice(&hw2.to_le_bytes());
6535        Ok(bytes)
6536    }
6537
6538    /// Encode a sequence of ARM instructions
6539    pub fn encode_sequence(&self, ops: &[ArmOp]) -> Result<Vec<u8>> {
6540        let mut code = Vec::new();
6541
6542        for op in ops {
6543            let encoded = self.encode(op)?;
6544            code.extend_from_slice(&encoded);
6545        }
6546
6547        Ok(code)
6548    }
6549}
6550
6551/// Convert register to bit encoding (0-15)
6552fn reg_to_bits(reg: &Reg) -> u32 {
6553    match reg {
6554        Reg::R0 => 0,
6555        Reg::R1 => 1,
6556        Reg::R2 => 2,
6557        Reg::R3 => 3,
6558        Reg::R4 => 4,
6559        Reg::R5 => 5,
6560        Reg::R6 => 6,
6561        Reg::R7 => 7,
6562        Reg::R8 => 8,
6563        Reg::R9 => 9,
6564        Reg::R10 => 10,
6565        Reg::R11 => 11,
6566        Reg::R12 => 12,
6567        Reg::SP => 13,
6568        Reg::LR => 14,
6569        Reg::PC => 15,
6570    }
6571}
6572
6573/// Fallible form of the `verify_reg_bits` contract. PC (R15) is not a valid
6574/// data operand for the Thumb-2 encodings that use this guard (SDIV/UDIV/MLS/…
6575/// are UNPREDICTABLE with PC). Synth's own codegen never emits PC there, but
6576/// the encoder must stay *total* over arbitrary `ArmOp` inputs — the fuzz
6577/// harness (`encoder_no_panic`) requires Ok-or-Err, never a panic. Pre-fix, the
6578/// `debug_assert` in `verify_reg_bits` aborted under `-Cdebug-assertions`.
6579/// Returns a typed Err instead. See #185.
6580fn reg_bits_checked(bits: u32) -> Result<()> {
6581    if bits > 14 {
6582        return Err(synth_core::Error::synthesis(format!(
6583            "register bits {bits} (PC/R15) is not a valid operand for this Thumb-2 encoding"
6584        )));
6585    }
6586    Ok(())
6587}
6588
/// Try to express `val` as an ARM rotated immediate (`imm8 ROR 2*rot4`).
///
/// Rotation amounts are tried from 0 upward, so the smallest rotation that
/// works is chosen. On success the result is
/// `Some(((rot4 << 8) | imm8, 1))`; the second element is always `1`.
/// Returns `None` when no rotation leaves an 8-bit value.
fn try_encode_rotated_imm(val: u32) -> Option<(u32, u32)> {
    (0..16u32).find_map(|rot4| {
        // Rotating left by 2*rot4 undoes the ROR; the remainder must fit in imm8.
        let imm8 = val.rotate_left(rot4 * 2);
        (imm8 <= 0xFF).then(|| ((rot4 << 8) | imm8, 1))
    })
}
6606
6607/// Encode operand2 field and return (bits, immediate_flag).
6608/// For ARM32 mode, immediates use the rotated-immediate encoding (imm8 ROR 2*rot4).
6609/// Panics if an immediate value cannot be represented. Callers that need large
6610/// immediates should use MOVW/MOVT instead of Operand2::Imm.
6611fn encode_operand2(op2: &Operand2) -> (u32, u32) {
6612    match op2 {
6613        Operand2::Imm(val) => {
6614            let uval = *val as u32;
6615            // Attempt rotated-immediate encoding (ARM32 Operand2)
6616            if let Some(encoded) = try_encode_rotated_imm(uval) {
6617                encoded
6618            } else {
6619                // Fallback: mask to 8 bits (legacy behavior for values that
6620                // cannot be represented). This should not be reached for
6621                // correctly-selected instructions; the instruction selector
6622                // must use MOVW/MOVT for large constants.
6623                let imm = uval & 0xFF;
6624                (imm, 1)
6625            }
6626        }
6627
6628        Operand2::Reg(reg) => {
6629            let reg_bits = reg_to_bits(reg);
6630            (reg_bits, 0) // I=0 for register
6631        }
6632
6633        Operand2::RegShift {
6634            rm,
6635            shift: _,
6636            amount,
6637        } => {
6638            // Simplified encoding with shift
6639            let rm_bits = reg_to_bits(rm);
6640            let shift_bits = (*amount & 0x1F) << 7;
6641            (shift_bits | rm_bits, 0)
6642        }
6643    }
6644}
6645
6646/// Encode memory address to (base_reg, offset)
6647fn encode_mem_addr(addr: &MemAddr) -> (u32, u32) {
6648    let base_bits = reg_to_bits(&addr.base);
6649    let offset_bits = (addr.offset as u32) & 0xFFF; // 12-bit offset
6650    (base_bits, offset_bits)
6651}
6652
6653/// S-register number: S0=0, S1=1, ..., S31=31
6654fn vfp_sreg_to_num(reg: &VfpReg) -> Result<u32> {
6655    match reg {
6656        VfpReg::S0 => Ok(0),
6657        VfpReg::S1 => Ok(1),
6658        VfpReg::S2 => Ok(2),
6659        VfpReg::S3 => Ok(3),
6660        VfpReg::S4 => Ok(4),
6661        VfpReg::S5 => Ok(5),
6662        VfpReg::S6 => Ok(6),
6663        VfpReg::S7 => Ok(7),
6664        VfpReg::S8 => Ok(8),
6665        VfpReg::S9 => Ok(9),
6666        VfpReg::S10 => Ok(10),
6667        VfpReg::S11 => Ok(11),
6668        VfpReg::S12 => Ok(12),
6669        VfpReg::S13 => Ok(13),
6670        VfpReg::S14 => Ok(14),
6671        VfpReg::S15 => Ok(15),
6672        VfpReg::S16 => Ok(16),
6673        VfpReg::S17 => Ok(17),
6674        VfpReg::S18 => Ok(18),
6675        VfpReg::S19 => Ok(19),
6676        VfpReg::S20 => Ok(20),
6677        VfpReg::S21 => Ok(21),
6678        VfpReg::S22 => Ok(22),
6679        VfpReg::S23 => Ok(23),
6680        VfpReg::S24 => Ok(24),
6681        VfpReg::S25 => Ok(25),
6682        VfpReg::S26 => Ok(26),
6683        VfpReg::S27 => Ok(27),
6684        VfpReg::S28 => Ok(28),
6685        VfpReg::S29 => Ok(29),
6686        VfpReg::S30 => Ok(30),
6687        VfpReg::S31 => Ok(31),
6688        // D-registers are not used in F32 single-precision encodings
6689        _ => Err(synth_core::Error::SynthesisError(
6690            "D-register not supported in single-precision VFP encoding".to_string(),
6691        )),
6692    }
6693}
6694
6695/// D-register number: D0=0, D1=1, ..., D15=15
6696fn vfp_dreg_to_num(reg: &VfpReg) -> Result<u32> {
6697    match reg {
6698        VfpReg::D0 => Ok(0),
6699        VfpReg::D1 => Ok(1),
6700        VfpReg::D2 => Ok(2),
6701        VfpReg::D3 => Ok(3),
6702        VfpReg::D4 => Ok(4),
6703        VfpReg::D5 => Ok(5),
6704        VfpReg::D6 => Ok(6),
6705        VfpReg::D7 => Ok(7),
6706        VfpReg::D8 => Ok(8),
6707        VfpReg::D9 => Ok(9),
6708        VfpReg::D10 => Ok(10),
6709        VfpReg::D11 => Ok(11),
6710        VfpReg::D12 => Ok(12),
6711        VfpReg::D13 => Ok(13),
6712        VfpReg::D14 => Ok(14),
6713        VfpReg::D15 => Ok(15),
6714        // S-registers are not used in F64 double-precision encodings
6715        _ => Err(synth_core::Error::SynthesisError(
6716            "S-register not supported in double-precision VFP encoding".to_string(),
6717        )),
6718    }
6719}
6720
/// Break an S-register number into `(Vx, qualifier_bit)` for VFP encoding.
///
/// `Vx` is the number halved (`s >> 1`) and the qualifier is its lowest bit.
/// Depending on the operand's role the qualifier lands in D (bit 22),
/// N (bit 7) or M (bit 5).
fn encode_sreg(s: u32) -> (u32, u32) {
    let vx = s / 2;
    let qualifier = s % 2;
    (vx, qualifier)
}
6727
/// Break a D-register number into `(Vx, qualifier_bit)` for double-precision
/// VFP encoding.
///
/// `Vx` is the low four bits of `d` and the qualifier is bit 4, so D0–D15
/// always yield a qualifier of 0.
fn encode_dreg(d: u32) -> (u32, u32) {
    let vx = d % 16;
    let qualifier = (d / 16) % 2;
    (vx, qualifier)
}
6734
6735/// Encode a VFP 3-register arithmetic instruction (VADD.F32, VSUB.F32, VMUL.F32, VDIV.F32).
6736/// Returns the full 32-bit instruction word.
6737///
6738/// VFP encoding: [cond 1110] [D opc1 Vn] [Vd 101 sz] [N opc2 M 0 Vm]
6739/// For single-precision (sz=0), coprocessor = 0xA (bits[11:8]).
6740fn encode_vfp_3reg(base: u32, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<u32> {
6741    let sd_num = vfp_sreg_to_num(sd)?;
6742    let sn_num = vfp_sreg_to_num(sn)?;
6743    let sm_num = vfp_sreg_to_num(sm)?;
6744    let (vd, d) = encode_sreg(sd_num);
6745    let (vn, n) = encode_sreg(sn_num);
6746    let (vm, m) = encode_sreg(sm_num);
6747
6748    Ok(base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm)
6749}
6750
6751/// Encode a VFP 2-register instruction (VNEG.F32, VABS.F32, VSQRT.F32).
6752/// Returns the full 32-bit instruction word.
6753fn encode_vfp_2reg(base: u32, sd: &VfpReg, sm: &VfpReg) -> Result<u32> {
6754    let sd_num = vfp_sreg_to_num(sd)?;
6755    let sm_num = vfp_sreg_to_num(sm)?;
6756    let (vd, d) = encode_sreg(sd_num);
6757    let (vm, m) = encode_sreg(sm_num);
6758
6759    Ok(base | (d << 22) | (vd << 12) | (m << 5) | vm)
6760}
6761
6762/// Encode a VFP load/store (VLDR.F32 / VSTR.F32).
6763/// offset is in bytes and must be word-aligned; encoded as imm8 = offset/4.
6764/// U bit (bit 23) controls add/subtract offset.
6765fn encode_vfp_ldst(base: u32, sd: &VfpReg, addr: &MemAddr) -> Result<u32> {
6766    let sd_num = vfp_sreg_to_num(sd)?;
6767    let (vd, d) = encode_sreg(sd_num);
6768    let rn = reg_to_bits(&addr.base);
6769
6770    let offset = addr.offset;
6771    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6772    let abs_offset = offset.unsigned_abs();
6773    let imm8 = (abs_offset / 4) & 0xFF;
6774
6775    Ok(base | (u_bit << 23) | (d << 22) | (rn << 16) | (vd << 12) | imm8)
6776}
6777
6778/// Encode VMOV between core register and S-register.
6779/// VMOV Sn, Rt: 0xEE00_0A10 | (Vn << 16) | (N << 7) | (Rt << 12)
6780/// VMOV Rt, Sn: 0xEE10_0A10 | (Vn << 16) | (N << 7) | (Rt << 12)
6781fn encode_vmov_core_sreg(to_sreg: bool, sreg: &VfpReg, core: &Reg) -> Result<u32> {
6782    let s_num = vfp_sreg_to_num(sreg)?;
6783    let (vn, n) = encode_sreg(s_num);
6784    let rt = reg_to_bits(core);
6785
6786    let base = if to_sreg { 0xEE000A10 } else { 0xEE100A10 };
6787    Ok(base | (vn << 16) | (rt << 12) | (n << 7))
6788}
6789
6790/// Encode a VFP 3-register double-precision instruction (VADD.F64, VSUB.F64, etc.).
6791/// For double-precision (sz=1), coprocessor = 0xB (bits[11:8]).
6792/// The base should have bit 8 = 1 for F64 (0xB suffix instead of 0xA).
6793fn encode_vfp_3reg_f64(base: u32, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<u32> {
6794    let dd_num = vfp_dreg_to_num(dd)?;
6795    let dn_num = vfp_dreg_to_num(dn)?;
6796    let dm_num = vfp_dreg_to_num(dm)?;
6797    let (vd, d) = encode_dreg(dd_num);
6798    let (vn, n) = encode_dreg(dn_num);
6799    let (vm, m) = encode_dreg(dm_num);
6800
6801    Ok(base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm)
6802}
6803
6804/// Encode a VFP 2-register double-precision instruction (VNEG.F64, VABS.F64, VSQRT.F64).
6805fn encode_vfp_2reg_f64(base: u32, dd: &VfpReg, dm: &VfpReg) -> Result<u32> {
6806    let dd_num = vfp_dreg_to_num(dd)?;
6807    let dm_num = vfp_dreg_to_num(dm)?;
6808    let (vd, d) = encode_dreg(dd_num);
6809    let (vm, m) = encode_dreg(dm_num);
6810
6811    Ok(base | (d << 22) | (vd << 12) | (m << 5) | vm)
6812}
6813
6814/// Encode a VFP load/store for double-precision (VLDR.64 / VSTR.64).
6815/// offset is in bytes and must be word-aligned; encoded as imm8 = offset/4.
6816fn encode_vfp_ldst_f64(base: u32, dd: &VfpReg, addr: &MemAddr) -> Result<u32> {
6817    let dd_num = vfp_dreg_to_num(dd)?;
6818    let (vd, d) = encode_dreg(dd_num);
6819    let rn = reg_to_bits(&addr.base);
6820
6821    let offset = addr.offset;
6822    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6823    let abs_offset = offset.unsigned_abs();
6824    let imm8 = (abs_offset / 4) & 0xFF;
6825
6826    Ok(base | (u_bit << 23) | (d << 22) | (rn << 16) | (vd << 12) | imm8)
6827}
6828
6829/// Encode VMOV between two core registers and a D-register.
6830/// VMOV Dm, Rt, Rt2: 0xEC40_0B10 | (Rt2 << 16) | (Rt << 12) | (M << 5) | Vm
6831/// VMOV Rt, Rt2, Dm: 0xEC50_0B10 | (Rt2 << 16) | (Rt << 12) | (M << 5) | Vm
6832fn encode_vmov_core_dreg(
6833    to_dreg: bool,
6834    dreg: &VfpReg,
6835    core_lo: &Reg,
6836    core_hi: &Reg,
6837) -> Result<u32> {
6838    let d_num = vfp_dreg_to_num(dreg)?;
6839    let (vm, m) = encode_dreg(d_num);
6840    let rt = reg_to_bits(core_lo);
6841    let rt2 = reg_to_bits(core_hi);
6842
6843    let base = if to_dreg { 0xEC400B10 } else { 0xEC500B10 };
6844    Ok(base | (rt2 << 16) | (rt << 12) | (m << 5) | vm)
6845}
6846
/// Serialize a 32-bit VFP instruction word in Thumb-2 byte order: the upper
/// halfword first, then the lower halfword, each written little-endian.
fn vfp_to_thumb_bytes(instr: u32) -> Vec<u8> {
    // The `as u16` casts keep exactly the low 16 bits of each operand.
    let upper = (instr >> 16) as u16;
    let lower = instr as u16;

    let mut out = Vec::with_capacity(4);
    out.extend_from_slice(&upper.to_le_bytes());
    out.extend_from_slice(&lower.to_le_bytes());
    out
}
6855
6856// ============================================================================
6857// Helium MVE encoding helpers
6858// ============================================================================
6859
6860/// Q-register number: Q0=0, Q1=1, ..., Q7=7
6861fn qreg_to_num(reg: &QReg) -> u32 {
6862    match reg {
6863        QReg::Q0 => 0,
6864        QReg::Q1 => 1,
6865        QReg::Q2 => 2,
6866        QReg::Q3 => 3,
6867        QReg::Q4 => 4,
6868        QReg::Q5 => 5,
6869        QReg::Q6 => 6,
6870        QReg::Q7 => 7,
6871    }
6872}
6873
6874/// MVE element size to encoding bits: S8=0b00, S16=0b01, S32=0b10
6875fn mve_size_bits(size: &MveSize) -> u32 {
6876    match size {
6877        MveSize::S8 => 0b00,
6878        MveSize::S16 => 0b01,
6879        MveSize::S32 => 0b10,
6880    }
6881}
6882
6883/// Encode MVE 3-register instruction.
6884/// Q-registers are encoded as D-register pairs: Q0=D0:D1, Q1=D2:D3, etc.
6885/// In NEON/MVE encoding, the Q-register uses D-register number = Qn * 2.
6886fn encode_mve_3reg(base: u32, qd: &QReg, qn: &QReg, qm: &QReg) -> u32 {
6887    let d = qreg_to_num(qd) * 2;
6888    let n = qreg_to_num(qn) * 2;
6889    let m = qreg_to_num(qm) * 2;
6890
6891    // Standard NEON/MVE 3-register encoding:
6892    // D bit (bit 22) = Vd[4], Vd[3:0] = bits [15:12]
6893    // N bit (bit 7)  = Vn[4], Vn[3:0] = bits [19:16]
6894    // M bit (bit 5)  = Vm[4], Vm[3:0] = bits [3:0]
6895    let vd = d & 0xF;
6896    let d_bit = (d >> 4) & 1;
6897    let vn = n & 0xF;
6898    let n_bit = (n >> 4) & 1;
6899    let vm = m & 0xF;
6900    let m_bit = (m >> 4) & 1;
6901
6902    base | (d_bit << 22) | (vn << 16) | (vd << 12) | (n_bit << 7) | (m_bit << 5) | vm
6903}
6904
6905/// Encode MVE 3-register bitwise instruction (VAND, VORR, VEOR, VBIC).
6906fn encode_mve_3reg_bitwise(base: u32, qd: &QReg, qn: &QReg, qm: &QReg) -> u32 {
6907    encode_mve_3reg(base, qd, qn, qm)
6908}
6909
6910/// Encode MVE VLDRW.32 Qd, [Rn, #offset]
6911/// Format: EC9x xxxx - contiguous load, word-sized elements
6912fn encode_mve_vldrw(qd: &QReg, addr: &MemAddr) -> u32 {
6913    let qd_enc = qreg_to_num(qd) * 2;
6914    let rn = reg_to_bits(&addr.base);
6915    let offset = addr.offset;
6916    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6917    let abs_offset = offset.unsigned_abs();
6918    let imm7 = (abs_offset / 4) & 0x7F; // 7-bit word-aligned offset
6919
6920    // VLDRW.32 Qd, [Rn, #imm]: ED10 xx80 variant
6921    0xED100E80
6922        | (u_bit << 23)
6923        | ((qd_enc >> 4) << 22)
6924        | (rn << 16)
6925        | ((qd_enc & 0xF) << 12)
6926        | (imm7 & 0x7F)
6927}
6928
6929/// Encode MVE VSTRW.32 Qd, [Rn, #offset]
6930fn encode_mve_vstrw(qd: &QReg, addr: &MemAddr) -> u32 {
6931    let qd_enc = qreg_to_num(qd) * 2;
6932    let rn = reg_to_bits(&addr.base);
6933    let offset = addr.offset;
6934    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
6935    let abs_offset = offset.unsigned_abs();
6936    let imm7 = (abs_offset / 4) & 0x7F;
6937
6938    0xED000E80
6939        | (u_bit << 23)
6940        | ((qd_enc >> 4) << 22)
6941        | (rn << 16)
6942        | ((qd_enc & 0xF) << 12)
6943        | (imm7 & 0x7F)
6944}
6945
6946impl ArmEncoder {
6947    /// Encode MVE constant load: MOVW+MOVT+VMOV for each 32-bit word, then assemble Q-register
6948    fn encode_thumb_mve_const(&self, qd: &QReg, bytes: &[u8; 16]) -> Result<Vec<u8>> {
6949        let mut result = Vec::new();
6950        let qd_num = qreg_to_num(qd);
6951
6952        // Load each 32-bit word into R12 (temp) then VMOV into S-register
6953        for i in 0..4 {
6954            let word = u32::from_le_bytes([
6955                bytes[i * 4],
6956                bytes[i * 4 + 1],
6957                bytes[i * 4 + 2],
6958                bytes[i * 4 + 3],
6959            ]);
6960            let lo16 = word & 0xFFFF;
6961            let hi16 = (word >> 16) & 0xFFFF;
6962
6963            // MOVW R12, #lo16
6964            result.extend_from_slice(&self.encode_thumb32_movw_raw(12, lo16)?);
6965            // MOVT R12, #hi16
6966            if hi16 != 0 {
6967                result.extend_from_slice(&self.encode_thumb32_movt_raw(12, hi16)?);
6968            }
6969
6970            // VMOV Sn, R12 where Sn = Qd*4 + i
6971            let s_num = qd_num * 4 + i as u32;
6972            let (vn, n) = encode_sreg(s_num);
6973            let vmov: u32 = 0xEE000A10 | (vn << 16) | (12 << 12) | (n << 7);
6974            result.extend_from_slice(&vfp_to_thumb_bytes(vmov));
6975        }
6976
6977        Ok(result)
6978    }
6979
6980    /// Encode lane-wise f32 binary operation (VDIV, etc.) via S-register extraction
6981    fn encode_thumb_mve_lane_wise_f32_binop(
6982        &self,
6983        qd: &QReg,
6984        qn: &QReg,
6985        qm: &QReg,
6986        vfp_base: u32,
6987    ) -> Result<Vec<u8>> {
6988        let mut result = Vec::new();
6989        let qd_num = qreg_to_num(qd);
6990        let qn_num = qreg_to_num(qn);
6991        let qm_num = qreg_to_num(qm);
6992
6993        // For each lane 0..3: use S-registers directly (Q aliasing)
6994        for i in 0..4u32 {
6995            let sd = qd_num * 4 + i;
6996            let sn = qn_num * 4 + i;
6997            let sm = qm_num * 4 + i;
6998
6999            let (vd, d) = encode_sreg(sd);
7000            let (vn, n) = encode_sreg(sn);
7001            let (vm, m) = encode_sreg(sm);
7002
7003            let instr = vfp_base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm;
7004            result.extend_from_slice(&vfp_to_thumb_bytes(instr));
7005        }
7006
7007        Ok(result)
7008    }
7009
7010    /// Encode lane-wise f32 VSQRT via S-register extraction
7011    fn encode_thumb_mve_lane_wise_f32_sqrt(&self, qd: &QReg, qm: &QReg) -> Result<Vec<u8>> {
7012        let mut result = Vec::new();
7013        let qd_num = qreg_to_num(qd);
7014        let qm_num = qreg_to_num(qm);
7015
7016        // VSQRT.F32 base: 0xEEB10AC0
7017        for i in 0..4u32 {
7018            let sd = qd_num * 4 + i;
7019            let sm = qm_num * 4 + i;
7020
7021            let (vd, d) = encode_sreg(sd);
7022            let (vm, m) = encode_sreg(sm);
7023
7024            let instr: u32 = 0xEEB10AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
7025            result.extend_from_slice(&vfp_to_thumb_bytes(instr));
7026        }
7027
7028        Ok(result)
7029    }
7030}
7031
7032#[cfg(test)]
7033mod tests {
7034    use super::*;
7035
7036    #[test]
7037    fn test_encoder_creation() {
7038        let encoder_arm = ArmEncoder::new_arm32();
7039        assert!(!encoder_arm.thumb_mode);
7040
7041        let encoder_thumb = ArmEncoder::new_thumb2();
7042        assert!(encoder_thumb.thumb_mode);
7043    }
7044
7045    /// #204 WAKE-path regression: `SetCond` materialized 0/1 with the 16-bit
7046    /// `MOVS Rd,#imm` (T1), whose Rd field is 3 bits (R0–R7). For a high Rd
7047    /// (R8–R12) `rd_bits << 8` overflows bit 11, flipping the opcode MOVS→CMP
7048    /// (`0x2c00`), so the boolean was never written — gale's `has_waiter` kept a
7049    /// stale value and the binary-sem WAKE dispatch read garbage. High Rd must
7050    /// use the 32-bit `MOV.W` (T2). Verify the bytes, not the IR.
7051    #[test]
7052    fn test_encode_setcond_high_reg_uses_mov_w_204() {
7053        use synth_synthesis::{ArmOp, Condition, Reg};
7054        let enc = ArmEncoder::new_thumb2();
7055        // R12 (high): must be ITE + MOV.W #1 + MOV.W #0, never a 16-bit MOVS/CMP.
7056        let hi = enc
7057            .encode(&ArmOp::SetCond {
7058                rd: Reg::R12,
7059                cond: Condition::NE,
7060            })
7061            .unwrap();
7062        assert_eq!(hi.len(), 10, "ITE(2) + MOV.W(4) + MOV.W(4): {hi:02x?}");
7063        // both value halfwords are MOV.W (0xF04F) — NOT the corrupt CMP (0x2c..).
7064        assert_eq!(&hi[2..4], &[0x4F, 0xF0], "then = MOV.W: {hi:02x?}");
7065        assert_eq!(&hi[6..8], &[0x4F, 0xF0], "else = MOV.W: {hi:02x?}");
7066        assert_eq!(hi[4] & 0x0F, 0x01, "then imm = #1");
7067        assert_eq!(hi[8] & 0x0F, 0x00, "else imm = #0");
7068        // Low Rd keeps the compact 16-bit MOVS form.
7069        let lo = enc
7070            .encode(&ArmOp::SetCond {
7071                rd: Reg::R0,
7072                cond: Condition::NE,
7073            })
7074            .unwrap();
7075        assert_eq!(lo.len(), 6, "ITE(2) + MOVS(2) + MOVS(2): {lo:02x?}");
7076        assert_eq!(lo[2..4], [0x01, 0x20], "then = MOVS R0,#1");
7077        assert_eq!(lo[4..6], [0x00, 0x20], "else = MOVS R0,#0");
7078    }
7079
7080    /// #209 Opt 1b: UMULL RdLo, RdHi, Rn, Rm encodes correctly on both ISAs.
7081    /// Thumb-2 T1: 1111 1011 1010 Rn | RdLo RdHi 0000 Rm.
7082    /// A32:        cond 0000 1000 RdHi RdLo Rm 1001 Rn.
7083    #[test]
7084    fn test_encode_umull_209b() {
7085        use synth_synthesis::{ArmOp, Reg};
7086        let op = ArmOp::Umull {
7087            rdlo: Reg::R4,
7088            rdhi: Reg::R5,
7089            rn: Reg::R0,
7090            rm: Reg::R3,
7091        };
7092        // Thumb-2: hw1 = 0xFBA0 | 0 = 0xFBA0; hw2 = (4<<12)|(5<<8)|3 = 0x4503.
7093        let t = ArmEncoder::new_thumb2().encode(&op).unwrap();
7094        assert_eq!(
7095            t,
7096            vec![0xA0, 0xFB, 0x03, 0x45],
7097            "umull r4,r5,r0,r3 (T2): {t:02x?}"
7098        );
7099        // A32: 0xE0800090 | (5<<16) | (4<<12) | (3<<8) | 0 = 0xE0854390.
7100        let a = ArmEncoder::new_arm32().encode(&op).unwrap();
7101        assert_eq!(
7102            a,
7103            0xE085_4390u32.to_le_bytes().to_vec(),
7104            "umull (A32): {a:02x?}"
7105        );
7106    }
7107
7108    /// #206 regression: the ARM32 (A32) `Ldr`/`Str` encoders fed `addr` through
7109    /// `encode_mem_addr`, which returns only the 12-bit immediate — so a register
7110    /// offset (`[rn, rm, #off]`) was silently dropped to `[rn, #off]`, sending
7111    /// the access to the wrong runtime address (silent miscompile on the default
7112    /// `--target arm`). A register offset must materialize `ip = rn + rm` and
7113    /// load from `[ip, #off]`. Verify the bytes.
7114    #[test]
7115    fn test_encode_arm32_indexed_load_keeps_index_206() {
7116        use synth_synthesis::{ArmOp, MemAddr, Reg};
7117        let enc = ArmEncoder::new_arm32();
7118        // ldr r0, [r11, r1, #8]  must NOT collapse to a single immediate ldr.
7119        let bytes = enc
7120            .encode(&ArmOp::Ldr {
7121                rd: Reg::R0,
7122                addr: MemAddr::reg_imm(Reg::R11, Reg::R1, 8),
7123            })
7124            .unwrap();
7125        assert_eq!(
7126            bytes.len(),
7127            8,
7128            "expected ADD ip + LDR (2 words): {bytes:02x?}"
7129        );
7130        let add = u32::from_le_bytes(bytes[0..4].try_into().unwrap());
7131        let ldr = u32::from_le_bytes(bytes[4..8].try_into().unwrap());
7132        // ADD ip, r11, r1  = 0xE08BC001
7133        assert_eq!(add, 0xE08B_C001, "ADD ip,r11,r1: {add:#010x}");
7134        // LDR r0, [ip, #8] = 0xE59C0008
7135        assert_eq!(ldr, 0xE59C_0008, "LDR r0,[ip,#8]: {ldr:#010x}");
7136        // A bare immediate ldr (the bug) would be 0xE59B0008 (base=r11) — reject.
7137        assert_ne!(ldr, 0xE59B_0008, "index must not be dropped");
7138    }
7139
7140    /// #178/#180 regression: the Thumb `Add`/`Adds`/`Subs` reg-forms used the
7141    /// 16-bit encoding unconditionally. For high registers (R12 base scratch,
7142    /// R8-R11 i64 pairs) the 3-bit register fields overflow and corrupt the
7143    /// operands — `add ip,ip,r0` came out as `adds r4,r5,r1` (0x186C), silently
7144    /// dropping the address operand and miscompiling every optimized memory
7145    /// access. High registers must use the 32-bit `.W` forms.
7146    #[test]
7147    fn test_encode_thumb_add_high_reg_uses_add_w_178_180() {
7148        let encoder = ArmEncoder::new_thumb2();
7149
7150        // add ip, ip, r0  — the exact MemLoad/MemStore base+addr op.
7151        let code = encoder
7152            .encode(&ArmOp::Add {
7153                rd: Reg::R12,
7154                rn: Reg::R12,
7155                op2: Operand2::Reg(Reg::R0),
7156            })
7157            .unwrap();
7158        // ADD.W ip, ip, r0 = EB0C 0C00 (little-endian halfwords).
7159        assert_eq!(
7160            code,
7161            vec![0x0C, 0xEB, 0x00, 0x0C],
7162            "high-reg Thumb ADD must be 32-bit ADD.W (EB0C 0C00), not corrupt 16-bit; got {code:02X?}"
7163        );
7164        // Must NOT be the buggy 16-bit 0x186C (`adds r4,r5,r1`).
7165        assert_ne!(code, vec![0x6C, 0x18], "regressed to corrupt 16-bit ADDS");
7166
7167        // Low-register add stays 16-bit (no regression for the common case).
7168        let lo = encoder
7169            .encode(&ArmOp::Add {
7170                rd: Reg::R1,
7171                rn: Reg::R2,
7172                op2: Operand2::Reg(Reg::R3),
7173            })
7174            .unwrap();
7175        assert_eq!(
7176            lo.len(),
7177            2,
7178            "low-reg ADD should remain 16-bit, got {lo:02X?}"
7179        );
7180    }
7181
7182    /// #178/#180 sibling: i64 low-word `Adds`/`Subs` can land in R8-R11 pairs;
7183    /// those must fall back to 32-bit ADDS.W/SUBS.W (flag-setting preserved).
7184    #[test]
7185    fn test_encode_thumb_adds_subs_high_reg_use_32bit_178_180() {
7186        let encoder = ArmEncoder::new_thumb2();
7187
7188        // adds r10, r10, r8  → ADDS.W = EB1A 0A08
7189        let adds = encoder
7190            .encode(&ArmOp::Adds {
7191                rd: Reg::R10,
7192                rn: Reg::R10,
7193                op2: Operand2::Reg(Reg::R8),
7194            })
7195            .unwrap();
7196        assert_eq!(
7197            adds,
7198            vec![0x1A, 0xEB, 0x08, 0x0A],
7199            "high-reg ADDS must be 32-bit ADDS.W (EB1A 0A08); got {adds:02X?}"
7200        );
7201
7202        // subs r10, r10, r8  → SUBS.W = EBBA 0A08
7203        let subs = encoder
7204            .encode(&ArmOp::Subs {
7205                rd: Reg::R10,
7206                rn: Reg::R10,
7207                op2: Operand2::Reg(Reg::R8),
7208            })
7209            .unwrap();
7210        assert_eq!(
7211            subs,
7212            vec![0xBA, 0xEB, 0x08, 0x0A],
7213            "high-reg SUBS must be 32-bit SUBS.W (EBBA 0A08); got {subs:02X?}"
7214        );
7215    }
7216
7217    /// #184 (sibling of #180): 16-bit CMN (T1) only encodes R0-R7. High registers
7218    /// must use 32-bit CMN.W, not the corrupt truncated 16-bit form.
7219    #[test]
7220    fn test_encode_thumb_cmn_high_reg_uses_cmn_w_184() {
7221        let encoder = ArmEncoder::new_thumb2();
7222
7223        // cmn r10, r8  → CMN.W = EB1A 0F08 (ADD.W S=1, Rd=PC discarded).
7224        let cmn = encoder
7225            .encode(&ArmOp::Cmn {
7226                rn: Reg::R10,
7227                op2: Operand2::Reg(Reg::R8),
7228            })
7229            .unwrap();
7230        assert_eq!(
7231            cmn,
7232            vec![0x1A, 0xEB, 0x08, 0x0F],
7233            "high-reg CMN must be 32-bit CMN.W (EB1A 0F08); got {cmn:02X?}"
7234        );
7235
7236        // Low registers stay 16-bit: cmn r1, r2 = 0x42D1.
7237        let lo = encoder
7238            .encode(&ArmOp::Cmn {
7239                rn: Reg::R1,
7240                op2: Operand2::Reg(Reg::R2),
7241            })
7242            .unwrap();
7243        assert_eq!(
7244            lo.len(),
7245            2,
7246            "low-reg CMN should remain 16-bit, got {lo:02X?}"
7247        );
7248        assert_eq!(lo, vec![0xD1, 0x42], "low-reg CMN bytes wrong: {lo:02X?}");
7249    }
7250
7251    /// #185 regression: feeding PC (R15) as a data operand to a Thumb-2 op that
7252    /// guards its registers must return Err, not panic under debug-assertions.
7253    /// (Synth never emits PC here; the fuzz harness requires encode() be total.)
7254    #[test]
7255    fn test_encode_pc_operand_returns_err_not_panic_185() {
7256        let encoder = ArmEncoder::new_thumb2();
7257        for op in [
7258            ArmOp::Sdiv {
7259                rd: Reg::PC,
7260                rn: Reg::R0,
7261                rm: Reg::R1,
7262            },
7263            ArmOp::Udiv {
7264                rd: Reg::R0,
7265                rn: Reg::PC,
7266                rm: Reg::R1,
7267            },
7268            ArmOp::Sdiv {
7269                rd: Reg::R0,
7270                rn: Reg::R1,
7271                rm: Reg::PC,
7272            },
7273        ] {
7274            let r = encoder.encode(&op);
7275            assert!(
7276                r.is_err(),
7277                "encode({op:?}) must return Err for a PC operand, got {r:?}"
7278            );
7279        }
7280        // Valid registers still encode fine (no false rejection).
7281        assert!(
7282            encoder
7283                .encode(&ArmOp::Sdiv {
7284                    rd: Reg::R0,
7285                    rn: Reg::R1,
7286                    rm: Reg::R2
7287                })
7288                .is_ok()
7289        );
7290    }
7291
7292    #[test]
7293    fn test_encode_nop_arm32() {
7294        let encoder = ArmEncoder::new_arm32();
7295        let code = encoder.encode(&ArmOp::Nop).unwrap();
7296
7297        assert_eq!(code.len(), 4); // ARM32 instructions are 4 bytes
7298        assert_eq!(code, vec![0x00, 0x00, 0xA0, 0xE1]); // MOV R0, R0
7299    }
7300
7301    #[test]
7302    fn test_encode_nop_thumb() {
7303        let encoder = ArmEncoder::new_thumb2();
7304        let code = encoder.encode(&ArmOp::Nop).unwrap();
7305
7306        assert_eq!(code.len(), 2); // Thumb instructions are 2 bytes
7307        assert_eq!(code, vec![0x00, 0xBF]); // NOP
7308    }
7309
7310    #[test]
7311    fn test_encode_mov_immediate_arm32() {
7312        let encoder = ArmEncoder::new_arm32();
7313        let op = ArmOp::Mov {
7314            rd: Reg::R0,
7315            op2: Operand2::Imm(42),
7316        };
7317
7318        let code = encoder.encode(&op).unwrap();
7319        assert_eq!(code.len(), 4);
7320
7321        // Verify it's a MOV instruction (bits should have immediate flag set)
7322        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7323        assert_eq!(instr & 0x0E000000, 0x02000000); // Check I bit is set
7324    }
7325
7326    #[test]
7327    fn test_encode_add_registers_arm32() {
7328        let encoder = ArmEncoder::new_arm32();
7329        let op = ArmOp::Add {
7330            rd: Reg::R0,
7331            rn: Reg::R1,
7332            op2: Operand2::Reg(Reg::R2),
7333        };
7334
7335        let code = encoder.encode(&op).unwrap();
7336        assert_eq!(code.len(), 4);
7337
7338        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7339        // Verify it's an ADD instruction with correct opcode
7340        assert_eq!(instr & 0x0FE00000, 0x00800000);
7341    }
7342
7343    #[test]
7344    fn test_encode_ldr_arm32() {
7345        let encoder = ArmEncoder::new_arm32();
7346        let op = ArmOp::Ldr {
7347            rd: Reg::R0,
7348            addr: MemAddr::imm(Reg::R1, 4),
7349        };
7350
7351        let code = encoder.encode(&op).unwrap();
7352        assert_eq!(code.len(), 4);
7353
7354        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7355        // Verify load bit is set
7356        assert_eq!(instr & 0x00100000, 0x00100000);
7357    }
7358
7359    #[test]
7360    fn test_encode_str_arm32() {
7361        let encoder = ArmEncoder::new_arm32();
7362        let op = ArmOp::Str {
7363            rd: Reg::R0,
7364            addr: MemAddr::imm(Reg::SP, 0),
7365        };
7366
7367        let code = encoder.encode(&op).unwrap();
7368        assert_eq!(code.len(), 4);
7369    }
7370
7371    #[test]
7372    fn test_encode_branch_arm32() {
7373        let encoder = ArmEncoder::new_arm32();
7374        let op = ArmOp::Bl {
7375            label: "main".to_string(),
7376        };
7377
7378        let code = encoder.encode(&op).unwrap();
7379        assert_eq!(code.len(), 4);
7380
7381        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7382        // Verify BL opcode
7383        assert_eq!(instr & 0x0F000000, 0x0B000000);
7384    }
7385
7386    /// Regression test for #167 + #174: the Thumb-2 BL relocatable placeholder
7387    /// must carry a -4 addend so an R_ARM_THM_CALL nets to exactly the symbol S.
7388    /// The correct encoding is what `gas` emits for `bl <extern>`: f7ff fffe
7389    /// (hw1=0xF7FF, hw2=0xFFFE), little-endian bytes FF F7 FE FF.
7390    ///   - 0xD000 (J1=J2=0) → ~+0x600000 garbage addend: `bl c0000c` / truncated
7391    ///     to fit (#167).
7392    ///   - 0xF800 (addend 0) → lands at S+4, one instruction past the callee
7393    ///     entry (#174).
7394    ///   - 0xFFFE (addend -4) → lands at S. Correct.
7395    #[test]
7396    fn test_encode_thumb_bl_placeholder_addend_167_174() {
7397        let encoder = ArmEncoder::new_thumb2();
7398        let op = ArmOp::Bl {
7399            label: "callee".to_string(),
7400        };
7401
7402        let code = encoder.encode(&op).unwrap();
7403        assert_eq!(code.len(), 4, "Thumb-2 BL is 32-bit");
7404
7405        let hw1 = u16::from_le_bytes([code[0], code[1]]);
7406        let hw2 = u16::from_le_bytes([code[2], code[3]]);
7407        assert_eq!(hw1, 0xF7FF, "BL first halfword (matches gas `bl <extern>`)");
7408        assert_eq!(
7409            hw2, 0xFFFE,
7410            "BL second halfword must be 0xFFFE (-4 addend → nets to S), not 0xF800 (→ S+4, #174) or 0xD000 (#167)"
7411        );
7412        assert_ne!(hw2, 0xF800, "0xF800 (addend 0) lands at S+4 (#174)");
7413        assert_ne!(hw2, 0xD000, "0xD000 bakes in a ~+0x600000 addend (#167)");
7414    }
7415
7416    #[test]
7417    fn test_encode_sequence() {
7418        let encoder = ArmEncoder::new_arm32();
7419        let ops = vec![
7420            ArmOp::Mov {
7421                rd: Reg::R0,
7422                op2: Operand2::Imm(42),
7423            },
7424            ArmOp::Mov {
7425                rd: Reg::R1,
7426                op2: Operand2::Imm(10),
7427            },
7428            ArmOp::Add {
7429                rd: Reg::R2,
7430                rn: Reg::R0,
7431                op2: Operand2::Reg(Reg::R1),
7432            },
7433        ];
7434
7435        let code = encoder.encode_sequence(&ops).unwrap();
7436        assert_eq!(code.len(), 12); // 3 instructions * 4 bytes
7437    }
7438
7439    #[test]
7440    fn test_reg_to_bits() {
7441        assert_eq!(reg_to_bits(&Reg::R0), 0);
7442        assert_eq!(reg_to_bits(&Reg::R7), 7);
7443        assert_eq!(reg_to_bits(&Reg::SP), 13);
7444        assert_eq!(reg_to_bits(&Reg::LR), 14);
7445        assert_eq!(reg_to_bits(&Reg::PC), 15);
7446    }
7447
7448    #[test]
7449    fn test_encode_bitwise_operations() {
7450        let encoder = ArmEncoder::new_arm32();
7451
7452        let and_op = ArmOp::And {
7453            rd: Reg::R0,
7454            rn: Reg::R1,
7455            op2: Operand2::Reg(Reg::R2),
7456        };
7457        let and_code = encoder.encode(&and_op).unwrap();
7458        assert_eq!(and_code.len(), 4);
7459
7460        let orr_op = ArmOp::Orr {
7461            rd: Reg::R0,
7462            rn: Reg::R1,
7463            op2: Operand2::Reg(Reg::R2),
7464        };
7465        let orr_code = encoder.encode(&orr_op).unwrap();
7466        assert_eq!(orr_code.len(), 4);
7467
7468        let eor_op = ArmOp::Eor {
7469            rd: Reg::R0,
7470            rn: Reg::R1,
7471            op2: Operand2::Reg(Reg::R2),
7472        };
7473        let eor_code = encoder.encode(&eor_op).unwrap();
7474        assert_eq!(eor_code.len(), 4);
7475    }
7476
7477    // === Thumb-2 32-bit encoding tests ===
7478
7479    #[test]
7480    fn test_encode_sdiv_thumb2() {
7481        let encoder = ArmEncoder::new_thumb2();
7482        let op = ArmOp::Sdiv {
7483            rd: Reg::R0,
7484            rn: Reg::R1,
7485            rm: Reg::R2,
7486        };
7487
7488        let code = encoder.encode(&op).unwrap();
7489        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7490
7491        // SDIV R0, R1, R2: 0xFB91 0xF0F2
7492        // First halfword: 0xFB90 | Rn(1) = 0xFB91
7493        // Second halfword: 0xF0F0 | Rd(0)<<8 | Rm(2) = 0xF0F2
7494        // Little-endian: [0x91, 0xFB, 0xF2, 0xF0]
7495        assert_eq!(code[0], 0x91);
7496        assert_eq!(code[1], 0xFB);
7497        assert_eq!(code[2], 0xF2);
7498        assert_eq!(code[3], 0xF0);
7499    }
7500
7501    #[test]
7502    fn test_encode_udiv_thumb2() {
7503        let encoder = ArmEncoder::new_thumb2();
7504        let op = ArmOp::Udiv {
7505            rd: Reg::R0,
7506            rn: Reg::R1,
7507            rm: Reg::R2,
7508        };
7509
7510        let code = encoder.encode(&op).unwrap();
7511        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7512
7513        // UDIV R0, R1, R2: 0xFBB1 0xF0F2
7514        // Little-endian: [0xB1, 0xFB, 0xF2, 0xF0]
7515        assert_eq!(code[0], 0xB1);
7516        assert_eq!(code[1], 0xFB);
7517        assert_eq!(code[2], 0xF2);
7518        assert_eq!(code[3], 0xF0);
7519    }
7520
7521    #[test]
7522    fn test_encode_mul_thumb2() {
7523        let encoder = ArmEncoder::new_thumb2();
7524        let op = ArmOp::Mul {
7525            rd: Reg::R0,
7526            rn: Reg::R1,
7527            rm: Reg::R2,
7528        };
7529
7530        let code = encoder.encode(&op).unwrap();
7531        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7532    }
7533
7534    #[test]
7535    fn test_encode_and_thumb2() {
7536        let encoder = ArmEncoder::new_thumb2();
7537        let op = ArmOp::And {
7538            rd: Reg::R0,
7539            rn: Reg::R1,
7540            op2: Operand2::Reg(Reg::R2),
7541        };
7542
7543        let code = encoder.encode(&op).unwrap();
7544        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7545    }
7546
7547    #[test]
7548    fn test_encode_lsl_thumb2_low_regs() {
7549        let encoder = ArmEncoder::new_thumb2();
7550        let op = ArmOp::Lsl {
7551            rd: Reg::R0,
7552            rn: Reg::R1,
7553            shift: 5,
7554        };
7555
7556        let code = encoder.encode(&op).unwrap();
7557        assert_eq!(code.len(), 2); // 16-bit for low registers
7558    }
7559
7560    #[test]
7561    fn test_encode_clz_thumb2() {
7562        let encoder = ArmEncoder::new_thumb2();
7563        let op = ArmOp::Clz {
7564            rd: Reg::R0,
7565            rm: Reg::R1,
7566        };
7567
7568        let code = encoder.encode(&op).unwrap();
7569        assert_eq!(code.len(), 4); // 32-bit Thumb-2 instruction
7570    }
7571
7572    #[test]
7573    fn test_encode_bx_thumb2() {
7574        let encoder = ArmEncoder::new_thumb2();
7575        let op = ArmOp::Bx { rm: Reg::LR };
7576
7577        let code = encoder.encode(&op).unwrap();
7578        assert_eq!(code.len(), 2); // 16-bit instruction
7579
7580        // BX LR: 0x4770
7581        assert_eq!(code, vec![0x70, 0x47]);
7582    }
7583
7584    // ========================================================================
7585    // f32 pseudo-op encoding tests
7586    // ========================================================================
7587
7588    #[test]
7589    fn test_encode_f32_abs_arm32() {
7590        let encoder = ArmEncoder::new_arm32();
7591        let op = ArmOp::F32Abs {
7592            sd: VfpReg::S0,
7593            sm: VfpReg::S2,
7594        };
7595        let code = encoder.encode(&op).unwrap();
7596        assert_eq!(code.len(), 4); // Single VFP instruction
7597    }
7598
7599    #[test]
7600    fn test_encode_f32_neg_arm32() {
7601        let encoder = ArmEncoder::new_arm32();
7602        let op = ArmOp::F32Neg {
7603            sd: VfpReg::S0,
7604            sm: VfpReg::S2,
7605        };
7606        let code = encoder.encode(&op).unwrap();
7607        assert_eq!(code.len(), 4);
7608    }
7609
7610    #[test]
7611    fn test_encode_f32_sqrt_arm32() {
7612        let encoder = ArmEncoder::new_arm32();
7613        let op = ArmOp::F32Sqrt {
7614            sd: VfpReg::S0,
7615            sm: VfpReg::S2,
7616        };
7617        let code = encoder.encode(&op).unwrap();
7618        assert_eq!(code.len(), 4);
7619    }
7620
7621    #[test]
7622    fn test_encode_f32_ceil_arm32() {
7623        let encoder = ArmEncoder::new_arm32();
7624        let op = ArmOp::F32Ceil {
7625            sd: VfpReg::S0,
7626            sm: VfpReg::S2,
7627        };
7628        let code = encoder.encode(&op).unwrap();
7629        // VMRS + BIC + ORR + VMSR + VCVT.S32.F32 + VMRS + BIC + VMSR + VCVT.F32.S32
7630        assert_eq!(code.len(), 36);
7631    }
7632
7633    #[test]
7634    fn test_encode_f32_floor_thumb2() {
7635        let encoder = ArmEncoder::new_thumb2();
7636        let op = ArmOp::F32Floor {
7637            sd: VfpReg::S0,
7638            sm: VfpReg::S2,
7639        };
7640        let code = encoder.encode(&op).unwrap();
7641        // VMRS + BIC.W + ORR.W + VMSR + VCVT + VMRS + BIC.W + VMSR + VCVT.F32.S32
7642        assert_eq!(code.len(), 36);
7643    }
7644
7645    #[test]
7646    fn test_encode_f32_min_arm32() {
7647        let encoder = ArmEncoder::new_arm32();
7648        let op = ArmOp::F32Min {
7649            sd: VfpReg::S0,
7650            sn: VfpReg::S2,
7651            sm: VfpReg::S4,
7652        };
7653        let code = encoder.encode(&op).unwrap();
7654        assert_eq!(code.len(), 16); // VMOV + VCMP + VMRS + conditional VMOV
7655    }
7656
7657    #[test]
7658    fn test_encode_f32_max_thumb2() {
7659        let encoder = ArmEncoder::new_thumb2();
7660        let op = ArmOp::F32Max {
7661            sd: VfpReg::S0,
7662            sn: VfpReg::S2,
7663            sm: VfpReg::S4,
7664        };
7665        let code = encoder.encode(&op).unwrap();
7666        // VMOV(4) + VCMP(4) + VMRS(4) + IT(2) + VMOV(4) = 18
7667        assert_eq!(code.len(), 18);
7668    }
7669
7670    #[test]
7671    fn test_encode_f32_copysign_arm32() {
7672        let encoder = ArmEncoder::new_arm32();
7673        let op = ArmOp::F32Copysign {
7674            sd: VfpReg::S0,
7675            sn: VfpReg::S2,
7676            sm: VfpReg::S4,
7677        };
7678        let code = encoder.encode(&op).unwrap();
7679        // VMOV + VMOV + AND + BIC + ORR + VMOV = 6 * 4 = 24
7680        assert_eq!(code.len(), 24);
7681    }
7682
7683    // ========================================================================
7684    // f64 encoding tests
7685    // ========================================================================
7686
7687    #[test]
7688    fn test_encode_f64_add_arm32() {
7689        let encoder = ArmEncoder::new_arm32();
7690        let op = ArmOp::F64Add {
7691            dd: VfpReg::D0,
7692            dn: VfpReg::D1,
7693            dm: VfpReg::D2,
7694        };
7695        let code = encoder.encode(&op).unwrap();
7696        assert_eq!(code.len(), 4);
7697        // VADD.F64 D0, D1, D2: check coprocessor is cp11 (0xB)
7698        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7699        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11
7700    }
7701
7702    #[test]
7703    fn test_encode_f64_sub_thumb2() {
7704        let encoder = ArmEncoder::new_thumb2();
7705        let op = ArmOp::F64Sub {
7706            dd: VfpReg::D0,
7707            dn: VfpReg::D1,
7708            dm: VfpReg::D2,
7709        };
7710        let code = encoder.encode(&op).unwrap();
7711        assert_eq!(code.len(), 4); // 32-bit VFP as two Thumb halfwords
7712    }
7713
7714    #[test]
7715    fn test_encode_f64_mul_arm32() {
7716        let encoder = ArmEncoder::new_arm32();
7717        let op = ArmOp::F64Mul {
7718            dd: VfpReg::D0,
7719            dn: VfpReg::D1,
7720            dm: VfpReg::D2,
7721        };
7722        let code = encoder.encode(&op).unwrap();
7723        assert_eq!(code.len(), 4);
7724    }
7725
7726    #[test]
7727    fn test_encode_f64_div_arm32() {
7728        let encoder = ArmEncoder::new_arm32();
7729        let op = ArmOp::F64Div {
7730            dd: VfpReg::D0,
7731            dn: VfpReg::D1,
7732            dm: VfpReg::D2,
7733        };
7734        let code = encoder.encode(&op).unwrap();
7735        assert_eq!(code.len(), 4);
7736    }
7737
7738    #[test]
7739    fn test_encode_f64_abs_arm32() {
7740        let encoder = ArmEncoder::new_arm32();
7741        let op = ArmOp::F64Abs {
7742            dd: VfpReg::D0,
7743            dm: VfpReg::D2,
7744        };
7745        let code = encoder.encode(&op).unwrap();
7746        assert_eq!(code.len(), 4);
7747    }
7748
7749    #[test]
7750    fn test_encode_f64_neg_arm32() {
7751        let encoder = ArmEncoder::new_arm32();
7752        let op = ArmOp::F64Neg {
7753            dd: VfpReg::D0,
7754            dm: VfpReg::D2,
7755        };
7756        let code = encoder.encode(&op).unwrap();
7757        assert_eq!(code.len(), 4);
7758    }
7759
7760    #[test]
7761    fn test_encode_f64_sqrt_arm32() {
7762        let encoder = ArmEncoder::new_arm32();
7763        let op = ArmOp::F64Sqrt {
7764            dd: VfpReg::D0,
7765            dm: VfpReg::D2,
7766        };
7767        let code = encoder.encode(&op).unwrap();
7768        assert_eq!(code.len(), 4);
7769    }
7770
7771    #[test]
7772    fn test_encode_f64_load_arm32() {
7773        let encoder = ArmEncoder::new_arm32();
7774        let op = ArmOp::F64Load {
7775            dd: VfpReg::D0,
7776            addr: MemAddr::imm(Reg::R0, 8),
7777        };
7778        let code = encoder.encode(&op).unwrap();
7779        assert_eq!(code.len(), 4);
7780        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7781        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11 for F64
7782        assert_eq!(instr & 0xFF, 2); // offset 8 / 4 = 2
7783    }
7784
7785    #[test]
7786    fn test_encode_f64_store_thumb2() {
7787        let encoder = ArmEncoder::new_thumb2();
7788        let op = ArmOp::F64Store {
7789            dd: VfpReg::D0,
7790            addr: MemAddr::imm(Reg::SP, 0),
7791        };
7792        let code = encoder.encode(&op).unwrap();
7793        assert_eq!(code.len(), 4);
7794    }
7795
7796    #[test]
7797    fn test_encode_f64_compare_arm32() {
7798        let encoder = ArmEncoder::new_arm32();
7799        let op = ArmOp::F64Eq {
7800            rd: Reg::R0,
7801            dn: VfpReg::D0,
7802            dm: VfpReg::D1,
7803        };
7804        let code = encoder.encode(&op).unwrap();
7805        assert_eq!(code.len(), 16); // VCMP + VMRS + MOV #0 + MOVcond #1
7806    }
7807
7808    #[test]
7809    fn test_encode_f64_compare_thumb2() {
7810        let encoder = ArmEncoder::new_thumb2();
7811        let op = ArmOp::F64Lt {
7812            rd: Reg::R0,
7813            dn: VfpReg::D0,
7814            dm: VfpReg::D1,
7815        };
7816        let code = encoder.encode(&op).unwrap();
7817        // VCMP(4) + VMRS(4) + MOVS(2) + IT(2) + MOV(2) = 14
7818        assert_eq!(code.len(), 14);
7819    }
7820
7821    #[test]
7822    fn test_encode_f64_const_arm32() {
7823        let encoder = ArmEncoder::new_arm32();
7824        let op = ArmOp::F64Const {
7825            dd: VfpReg::D0,
7826            value: 3.125,
7827        };
7828        let code = encoder.encode(&op).unwrap();
7829        // MOVW(4) + MOVT(4) + MOVW(4) + MOVT(4) + VMOV(4) = 20
7830        assert_eq!(code.len(), 20);
7831    }
7832
7833    #[test]
7834    fn test_encode_f64_const_thumb2() {
7835        let encoder = ArmEncoder::new_thumb2();
7836        let op = ArmOp::F64Const {
7837            dd: VfpReg::D0,
7838            value: 2.5,
7839        };
7840        let code = encoder.encode(&op).unwrap();
7841        // MOVW(4) + MOVT(4) + MOVW(4) + MOVT(4) + VMOV(4) = 20
7842        assert_eq!(code.len(), 20);
7843    }
7844
7845    #[test]
7846    fn test_encode_f64_convert_i32s_arm32() {
7847        let encoder = ArmEncoder::new_arm32();
7848        let op = ArmOp::F64ConvertI32S {
7849            dd: VfpReg::D0,
7850            rm: Reg::R0,
7851        };
7852        let code = encoder.encode(&op).unwrap();
7853        // VMOV(4) + VCVT(4) = 8
7854        assert_eq!(code.len(), 8);
7855    }
7856
7857    #[test]
7858    fn test_encode_f64_promote_f32_arm32() {
7859        let encoder = ArmEncoder::new_arm32();
7860        let op = ArmOp::F64PromoteF32 {
7861            dd: VfpReg::D0,
7862            sm: VfpReg::S0,
7863        };
7864        let code = encoder.encode(&op).unwrap();
7865        assert_eq!(code.len(), 4); // Single VCVT.F64.F32 instruction
7866    }
7867
7868    #[test]
7869    fn test_encode_f64_promote_f32_thumb2() {
7870        let encoder = ArmEncoder::new_thumb2();
7871        let op = ArmOp::F64PromoteF32 {
7872            dd: VfpReg::D0,
7873            sm: VfpReg::S0,
7874        };
7875        let code = encoder.encode(&op).unwrap();
7876        assert_eq!(code.len(), 4);
7877    }
7878
7879    #[test]
7880    fn test_encode_i32_trunc_f64s_arm32() {
7881        let encoder = ArmEncoder::new_arm32();
7882        let op = ArmOp::I32TruncF64S {
7883            rd: Reg::R0,
7884            dm: VfpReg::D0,
7885        };
7886        let code = encoder.encode(&op).unwrap();
7887        // VCVT(4) + VMOV(4) = 8
7888        assert_eq!(code.len(), 8);
7889    }
7890
7891    #[test]
7892    fn test_encode_f64_reinterpret_i64_arm32() {
7893        let encoder = ArmEncoder::new_arm32();
7894        let op = ArmOp::F64ReinterpretI64 {
7895            dd: VfpReg::D0,
7896            rmlo: Reg::R0,
7897            rmhi: Reg::R1,
7898        };
7899        let code = encoder.encode(&op).unwrap();
7900        assert_eq!(code.len(), 4); // Single VMOV instruction
7901    }
7902
7903    #[test]
7904    fn test_encode_i64_reinterpret_f64_thumb2() {
7905        let encoder = ArmEncoder::new_thumb2();
7906        let op = ArmOp::I64ReinterpretF64 {
7907            rdlo: Reg::R0,
7908            rdhi: Reg::R1,
7909            dm: VfpReg::D0,
7910        };
7911        let code = encoder.encode(&op).unwrap();
7912        assert_eq!(code.len(), 4);
7913    }
7914
7915    #[test]
7916    fn test_encode_f64_trunc_thumb2() {
7917        let encoder = ArmEncoder::new_thumb2();
7918        let op = ArmOp::F64Trunc {
7919            dd: VfpReg::D0,
7920            dm: VfpReg::D1,
7921        };
7922        let code = encoder.encode(&op).unwrap();
7923        // Two VFP instructions via Thumb encoding
7924        assert_eq!(code.len(), 8);
7925    }
7926
7927    #[test]
7928    fn test_encode_f64_min_arm32() {
7929        let encoder = ArmEncoder::new_arm32();
7930        let op = ArmOp::F64Min {
7931            dd: VfpReg::D0,
7932            dn: VfpReg::D1,
7933            dm: VfpReg::D2,
7934        };
7935        let code = encoder.encode(&op).unwrap();
7936        // VMOV + VCMP + VMRS + conditional VMOV = 16
7937        assert_eq!(code.len(), 16);
7938    }
7939
7940    #[test]
7941    fn test_f64_cp11_encoding() {
7942        // Verify that F64 instructions use coprocessor 11 (0xB), not 10 (0xA)
7943        let encoder = ArmEncoder::new_arm32();
7944
7945        // F64Add
7946        let code = encoder
7947            .encode(&ArmOp::F64Add {
7948                dd: VfpReg::D0,
7949                dn: VfpReg::D0,
7950                dm: VfpReg::D0,
7951            })
7952            .unwrap();
7953        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7954        assert_eq!((instr >> 8) & 0xF, 0xB, "F64 should use cp11");
7955
7956        // F32Add for comparison
7957        let code = encoder
7958            .encode(&ArmOp::F32Add {
7959                sd: VfpReg::S0,
7960                sn: VfpReg::S0,
7961                sm: VfpReg::S0,
7962            })
7963            .unwrap();
7964        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7965        assert_eq!((instr >> 8) & 0xF, 0xA, "F32 should use cp10");
7966    }
7967
7968    #[test]
7969    fn test_dreg_encoding_higher_registers() {
7970        let encoder = ArmEncoder::new_arm32();
7971
7972        // Test with D15 (highest register)
7973        let op = ArmOp::F64Add {
7974            dd: VfpReg::D15,
7975            dn: VfpReg::D14,
7976            dm: VfpReg::D13,
7977        };
7978        let code = encoder.encode(&op).unwrap();
7979        assert_eq!(code.len(), 4);
7980
7981        // Verify the register encoding worked (instruction is valid)
7982        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7983        assert_eq!((instr >> 8) & 0xF, 0xB); // cp11
7984    }
7985
7986    // ========================================================================
7987    // Control flow encoding tests
7988    // ========================================================================
7989
7990    #[test]
7991    fn test_encode_label_emits_no_bytes() {
7992        let encoder = ArmEncoder::new_thumb2();
7993        let op = ArmOp::Label {
7994            name: ".Lblock_end_0".to_string(),
7995        };
7996        let code = encoder.encode(&op).unwrap();
7997        assert!(code.is_empty(), "Label should emit zero bytes");
7998
7999        let encoder32 = ArmEncoder::new_arm32();
8000        let code32 = encoder32.encode(&op).unwrap();
8001        assert!(
8002            code32.is_empty(),
8003            "Label should emit zero bytes in ARM32 too"
8004        );
8005    }
8006
8007    #[test]
8008    fn test_encode_bcc_eq_thumb2() {
8009        use synth_synthesis::Condition;
8010        let encoder = ArmEncoder::new_thumb2();
8011        let op = ArmOp::Bcc {
8012            cond: Condition::EQ,
8013            label: "target".to_string(),
8014        };
8015        let code = encoder.encode(&op).unwrap();
8016        assert_eq!(code.len(), 2); // 16-bit conditional branch
8017
8018        // BEQ with offset 0: 0xD000 in little-endian
8019        assert_eq!(code, vec![0x00, 0xD0]);
8020    }
8021
8022    #[test]
8023    fn test_encode_bcc_ne_thumb2() {
8024        use synth_synthesis::Condition;
8025        let encoder = ArmEncoder::new_thumb2();
8026        let op = ArmOp::Bcc {
8027            cond: Condition::NE,
8028            label: "target".to_string(),
8029        };
8030        let code = encoder.encode(&op).unwrap();
8031        assert_eq!(code.len(), 2);
8032
8033        // BNE with offset 0: 0xD100 in little-endian
8034        assert_eq!(code, vec![0x00, 0xD1]);
8035    }
8036
8037    #[test]
8038    fn test_encode_bcc_arm32() {
8039        use synth_synthesis::Condition;
8040        let encoder = ArmEncoder::new_arm32();
8041        let op = ArmOp::Bcc {
8042            cond: Condition::EQ,
8043            label: "target".to_string(),
8044        };
8045        let code = encoder.encode(&op).unwrap();
8046        assert_eq!(code.len(), 4); // 32-bit ARM instruction
8047
8048        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8049        // BEQ: cond=0x0, opcode=0xA, offset=0
8050        assert_eq!(instr & 0xF0000000, 0x00000000); // EQ condition
8051        assert_eq!(instr & 0x0F000000, 0x0A000000); // Branch opcode
8052    }
8053
8054    #[test]
8055    fn test_encode_udf_thumb2() {
8056        let encoder = ArmEncoder::new_thumb2();
8057        let op = ArmOp::Udf { imm: 0 };
8058        let code = encoder.encode(&op).unwrap();
8059        assert_eq!(code.len(), 2); // 16-bit
8060
8061        // UDF #0: 0xDE00 in little-endian
8062        assert_eq!(code, vec![0x00, 0xDE]);
8063    }
8064
8065    #[test]
8066    fn test_encode_nop_thumb2() {
8067        let encoder = ArmEncoder::new_thumb2();
8068        let op = ArmOp::Nop;
8069        let code = encoder.encode(&op).unwrap();
8070        assert_eq!(code.len(), 2); // 16-bit
8071
8072        // NOP: 0xBF00 in little-endian
8073        assert_eq!(code, vec![0x00, 0xBF]);
8074    }
8075
8076    // =========================================================================
8077    // i64 Thumb-2 encoding tests
8078    // =========================================================================
8079
8080    #[test]
8081    fn test_encode_i64_add_thumb2() {
8082        let encoder = ArmEncoder::new_thumb2();
8083        let op = ArmOp::I64Add {
8084            rdlo: Reg::R0,
8085            rdhi: Reg::R1,
8086            rnlo: Reg::R0,
8087            rnhi: Reg::R1,
8088            rmlo: Reg::R2,
8089            rmhi: Reg::R3,
8090        };
8091        let code = encoder.encode(&op).unwrap();
8092        // Should emit ADDS (2 bytes) + ADC.W (4 bytes) = 6 bytes
8093        assert_eq!(code.len(), 6, "I64Add should be 6 bytes (ADDS + ADC.W)");
8094    }
8095
8096    #[test]
8097    fn test_encode_i64_sub_thumb2() {
8098        let encoder = ArmEncoder::new_thumb2();
8099        let op = ArmOp::I64Sub {
8100            rdlo: Reg::R0,
8101            rdhi: Reg::R1,
8102            rnlo: Reg::R0,
8103            rnhi: Reg::R1,
8104            rmlo: Reg::R2,
8105            rmhi: Reg::R3,
8106        };
8107        let code = encoder.encode(&op).unwrap();
8108        // Should emit SUBS (2 bytes) + SBC.W (4 bytes) = 6 bytes
8109        assert_eq!(code.len(), 6, "I64Sub should be 6 bytes (SUBS + SBC.W)");
8110    }
8111
8112    #[test]
8113    fn test_encode_i64_and_thumb2() {
8114        let encoder = ArmEncoder::new_thumb2();
8115        let op = ArmOp::I64And {
8116            rdlo: Reg::R0,
8117            rdhi: Reg::R1,
8118            rnlo: Reg::R0,
8119            rnhi: Reg::R1,
8120            rmlo: Reg::R2,
8121            rmhi: Reg::R3,
8122        };
8123        let code = encoder.encode(&op).unwrap();
8124        // AND.W (4 bytes) + AND.W (4 bytes) = 8 bytes
8125        assert!(code.len() >= 4, "I64And should emit at least 4 bytes");
8126    }
8127
8128    #[test]
8129    fn test_encode_i64_or_thumb2() {
8130        let encoder = ArmEncoder::new_thumb2();
8131        let op = ArmOp::I64Or {
8132            rdlo: Reg::R0,
8133            rdhi: Reg::R1,
8134            rnlo: Reg::R0,
8135            rnhi: Reg::R1,
8136            rmlo: Reg::R2,
8137            rmhi: Reg::R3,
8138        };
8139        let code = encoder.encode(&op).unwrap();
8140        assert!(code.len() >= 4, "I64Or should emit at least 4 bytes");
8141    }
8142
8143    #[test]
8144    fn test_encode_i64_xor_thumb2() {
8145        let encoder = ArmEncoder::new_thumb2();
8146        let op = ArmOp::I64Xor {
8147            rdlo: Reg::R0,
8148            rdhi: Reg::R1,
8149            rnlo: Reg::R0,
8150            rnhi: Reg::R1,
8151            rmlo: Reg::R2,
8152            rmhi: Reg::R3,
8153        };
8154        let code = encoder.encode(&op).unwrap();
8155        assert!(code.len() >= 4, "I64Xor should emit at least 4 bytes");
8156    }
8157
8158    #[test]
8159    fn test_encode_i64_const_small_thumb2() {
8160        let encoder = ArmEncoder::new_thumb2();
8161        // Small constant: only needs MOVW for each half
8162        let op = ArmOp::I64Const {
8163            rdlo: Reg::R0,
8164            rdhi: Reg::R1,
8165            value: 42,
8166        };
8167        let code = encoder.encode(&op).unwrap();
8168        // MOVW R0, #42 (4 bytes) + MOVW R1, #0 (4 bytes) = 8 bytes minimum
8169        assert!(code.len() >= 8, "I64Const should emit at least 8 bytes");
8170    }
8171
8172    #[test]
8173    fn test_encode_i64_const_large_thumb2() {
8174        let encoder = ArmEncoder::new_thumb2();
8175        // Large constant: needs MOVW+MOVT for each half
8176        let op = ArmOp::I64Const {
8177            rdlo: Reg::R0,
8178            rdhi: Reg::R1,
8179            value: 0x1234_5678_9ABC_DEF0_u64 as i64,
8180        };
8181        let code = encoder.encode(&op).unwrap();
8182        // MOVW + MOVT for lo (8 bytes) + MOVW + MOVT for hi (8 bytes) = 16 bytes
8183        assert_eq!(
8184            code.len(),
8185            16,
8186            "I64Const with large value should be 16 bytes"
8187        );
8188    }
8189
8190    #[test]
8191    fn test_encode_i64_extend_i32_s_thumb2() {
8192        let encoder = ArmEncoder::new_thumb2();
8193        let op = ArmOp::I64ExtendI32S {
8194            rdlo: Reg::R0,
8195            rdhi: Reg::R1,
8196            rn: Reg::R0,
8197        };
8198        let code = encoder.encode(&op).unwrap();
8199        // When rdlo == rn, only ASR (4 bytes) is emitted
8200        assert_eq!(
8201            code.len(),
8202            4,
8203            "I64ExtendI32S (same reg) should be 4 bytes (ASR only)"
8204        );
8205    }
8206
8207    #[test]
8208    fn test_encode_i64_extend_i32_s_diff_reg_thumb2() {
8209        let encoder = ArmEncoder::new_thumb2();
8210        let op = ArmOp::I64ExtendI32S {
8211            rdlo: Reg::R0,
8212            rdhi: Reg::R1,
8213            rn: Reg::R2,
8214        };
8215        let code = encoder.encode(&op).unwrap();
8216        // MOV rdlo, rn (2 bytes for low regs) + ASR rdhi, rdlo, #31 (4 bytes) = 6 bytes
8217        assert!(
8218            code.len() >= 6,
8219            "I64ExtendI32S (diff reg) should be at least 6 bytes"
8220        );
8221    }
8222
8223    #[test]
8224    fn test_encode_i64_extend_i32_u_thumb2() {
8225        let encoder = ArmEncoder::new_thumb2();
8226        let op = ArmOp::I64ExtendI32U {
8227            rdlo: Reg::R0,
8228            rdhi: Reg::R1,
8229            rn: Reg::R0,
8230        };
8231        let code = encoder.encode(&op).unwrap();
8232        // When rdlo == rn, only MOV rdhi, #0 (2 bytes) is emitted
8233        assert_eq!(
8234            code.len(),
8235            2,
8236            "I64ExtendI32U (same reg) should be 2 bytes (MOV #0 only)"
8237        );
8238    }
8239
8240    #[test]
8241    fn test_encode_i32_wrap_i64_nop_thumb2() {
8242        let encoder = ArmEncoder::new_thumb2();
8243        // When rd == rnlo, should be a NOP
8244        let op = ArmOp::I32WrapI64 {
8245            rd: Reg::R0,
8246            rnlo: Reg::R0,
8247        };
8248        let code = encoder.encode(&op).unwrap();
8249        assert_eq!(code.len(), 2, "I32WrapI64 same reg should be NOP (2 bytes)");
8250        assert_eq!(code, vec![0x00, 0xBF]); // NOP
8251    }
8252
8253    #[test]
8254    fn test_encode_i32_wrap_i64_diff_reg_thumb2() {
8255        let encoder = ArmEncoder::new_thumb2();
8256        let op = ArmOp::I32WrapI64 {
8257            rd: Reg::R2,
8258            rnlo: Reg::R0,
8259        };
8260        let code = encoder.encode(&op).unwrap();
8261        // MOV R2, R0 (2 or 4 bytes)
8262        assert!(
8263            code.len() >= 2,
8264            "I32WrapI64 diff reg should emit at least 2 bytes"
8265        );
8266    }
8267
8268    #[test]
8269    fn test_encode_i64_eqz_thumb2() {
8270        let encoder = ArmEncoder::new_thumb2();
8271        let op = ArmOp::I64Eqz {
8272            rd: Reg::R0,
8273            rnlo: Reg::R0,
8274            rnhi: Reg::R1,
8275        };
8276        let code = encoder.encode(&op).unwrap();
8277        // Delegates to I64SetCondZ which is already encoded
8278        assert!(
8279            code.len() >= 6,
8280            "I64Eqz should emit at least 6 bytes for ORR+ITE+MOV+MOV"
8281        );
8282    }
8283
8284    #[test]
8285    fn test_encode_i64_eq_thumb2() {
8286        let encoder = ArmEncoder::new_thumb2();
8287        let op = ArmOp::I64Eq {
8288            rd: Reg::R0,
8289            rnlo: Reg::R0,
8290            rnhi: Reg::R1,
8291            rmlo: Reg::R2,
8292            rmhi: Reg::R3,
8293        };
8294        let code = encoder.encode(&op).unwrap();
8295        // Delegates to I64SetCond EQ: CMP lo + IT EQ + CMPEQ hi + ITE EQ + MOV 1 + MOV 0
8296        assert!(code.len() >= 10, "I64Eq should emit at least 10 bytes");
8297    }
8298
8299    #[test]
8300    fn test_encode_i64_ldr_thumb2() {
8301        let encoder = ArmEncoder::new_thumb2();
8302        let op = ArmOp::I64Ldr {
8303            rdlo: Reg::R0,
8304            rdhi: Reg::R1,
8305            addr: MemAddr::imm(Reg::SP, 0),
8306        };
8307        let code = encoder.encode(&op).unwrap();
8308        // Two LDR instructions (lo at offset, hi at offset+4)
8309        assert!(code.len() >= 4, "I64Ldr should emit at least 4 bytes");
8310    }
8311
8312    #[test]
8313    fn test_encode_i64_str_thumb2() {
8314        let encoder = ArmEncoder::new_thumb2();
8315        let op = ArmOp::I64Str {
8316            rdlo: Reg::R0,
8317            rdhi: Reg::R1,
8318            addr: MemAddr::imm(Reg::SP, 0),
8319        };
8320        let code = encoder.encode(&op).unwrap();
8321        // Two STR instructions (lo at offset, hi at offset+4)
8322        assert!(code.len() >= 4, "I64Str should emit at least 4 bytes");
8323    }
8324
8325    #[test]
8326    fn test_encode_i64_all_comparisons_thumb2() {
8327        let encoder = ArmEncoder::new_thumb2();
8328
8329        let ops = vec![
8330            ArmOp::I64Ne {
8331                rd: Reg::R0,
8332                rnlo: Reg::R0,
8333                rnhi: Reg::R1,
8334                rmlo: Reg::R2,
8335                rmhi: Reg::R3,
8336            },
8337            ArmOp::I64LtS {
8338                rd: Reg::R0,
8339                rnlo: Reg::R0,
8340                rnhi: Reg::R1,
8341                rmlo: Reg::R2,
8342                rmhi: Reg::R3,
8343            },
8344            ArmOp::I64LtU {
8345                rd: Reg::R0,
8346                rnlo: Reg::R0,
8347                rnhi: Reg::R1,
8348                rmlo: Reg::R2,
8349                rmhi: Reg::R3,
8350            },
8351            ArmOp::I64LeS {
8352                rd: Reg::R0,
8353                rnlo: Reg::R0,
8354                rnhi: Reg::R1,
8355                rmlo: Reg::R2,
8356                rmhi: Reg::R3,
8357            },
8358            ArmOp::I64LeU {
8359                rd: Reg::R0,
8360                rnlo: Reg::R0,
8361                rnhi: Reg::R1,
8362                rmlo: Reg::R2,
8363                rmhi: Reg::R3,
8364            },
8365            ArmOp::I64GtS {
8366                rd: Reg::R0,
8367                rnlo: Reg::R0,
8368                rnhi: Reg::R1,
8369                rmlo: Reg::R2,
8370                rmhi: Reg::R3,
8371            },
8372            ArmOp::I64GtU {
8373                rd: Reg::R0,
8374                rnlo: Reg::R0,
8375                rnhi: Reg::R1,
8376                rmlo: Reg::R2,
8377                rmhi: Reg::R3,
8378            },
8379            ArmOp::I64GeS {
8380                rd: Reg::R0,
8381                rnlo: Reg::R0,
8382                rnhi: Reg::R1,
8383                rmlo: Reg::R2,
8384                rmhi: Reg::R3,
8385            },
8386            ArmOp::I64GeU {
8387                rd: Reg::R0,
8388                rnlo: Reg::R0,
8389                rnhi: Reg::R1,
8390                rmlo: Reg::R2,
8391                rmhi: Reg::R3,
8392            },
8393        ];
8394
8395        for op in &ops {
8396            let code = encoder.encode(op).unwrap();
8397            assert!(
8398                code.len() >= 8,
8399                "i64 comparison {:?} should emit at least 8 bytes, got {}",
8400                op,
8401                code.len()
8402            );
8403        }
8404    }
8405
8406    #[test]
8407    fn test_encode_i64_const_zero_thumb2() {
8408        let encoder = ArmEncoder::new_thumb2();
8409        let op = ArmOp::I64Const {
8410            rdlo: Reg::R0,
8411            rdhi: Reg::R1,
8412            value: 0,
8413        };
8414        let code = encoder.encode(&op).unwrap();
8415        // MOVW R0, #0 (4 bytes) + MOVW R1, #0 (4 bytes) = 8 bytes
8416        assert_eq!(code.len(), 8, "I64Const(0) should be 8 bytes");
8417    }
8418
8419    #[test]
8420    fn test_encode_i64_const_negative_one_thumb2() {
8421        let encoder = ArmEncoder::new_thumb2();
8422        let op = ArmOp::I64Const {
8423            rdlo: Reg::R0,
8424            rdhi: Reg::R1,
8425            value: -1, // 0xFFFF_FFFF_FFFF_FFFF
8426        };
8427        let code = encoder.encode(&op).unwrap();
8428        // MOVW + MOVT for lo (8 bytes) + MOVW + MOVT for hi (8 bytes) = 16 bytes
8429        assert_eq!(code.len(), 16, "I64Const(-1) should be 16 bytes");
8430    }
8431
8432    // =========================================================================
8433    // Sub-word load/store encoding tests
8434    // =========================================================================
8435
8436    #[test]
8437    fn test_encode_ldrb_arm32() {
8438        let encoder = ArmEncoder::new_arm32();
8439        let op = ArmOp::Ldrb {
8440            rd: Reg::R0,
8441            addr: MemAddr::imm(Reg::R1, 4),
8442        };
8443        let code = encoder.encode(&op).unwrap();
8444        assert_eq!(code.len(), 4, "ARM32 LDRB should be 4 bytes");
8445        // LDRB R0, [R1, #4] = 0xE5D10004
8446        let encoded = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8447        assert_eq!(encoded, 0xE5D10004, "Should encode LDRB R0, [R1, #4]");
8448    }
8449
8450    #[test]
8451    fn test_encode_strb_arm32() {
8452        let encoder = ArmEncoder::new_arm32();
8453        let op = ArmOp::Strb {
8454            rd: Reg::R0,
8455            addr: MemAddr::imm(Reg::R1, 0),
8456        };
8457        let code = encoder.encode(&op).unwrap();
8458        assert_eq!(code.len(), 4, "ARM32 STRB should be 4 bytes");
8459        // STRB R0, [R1, #0] = 0xE5C10000
8460        let encoded = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8461        assert_eq!(encoded, 0xE5C10000, "Should encode STRB R0, [R1, #0]");
8462    }
8463
8464    #[test]
8465    fn test_encode_ldrh_arm32() {
8466        let encoder = ArmEncoder::new_arm32();
8467        let op = ArmOp::Ldrh {
8468            rd: Reg::R0,
8469            addr: MemAddr::imm(Reg::R1, 2),
8470        };
8471        let code = encoder.encode(&op).unwrap();
8472        assert_eq!(code.len(), 4, "ARM32 LDRH should be 4 bytes");
8473    }
8474
8475    #[test]
8476    fn test_encode_strh_arm32() {
8477        let encoder = ArmEncoder::new_arm32();
8478        let op = ArmOp::Strh {
8479            rd: Reg::R0,
8480            addr: MemAddr::imm(Reg::R1, 0),
8481        };
8482        let code = encoder.encode(&op).unwrap();
8483        assert_eq!(code.len(), 4, "ARM32 STRH should be 4 bytes");
8484    }
8485
8486    #[test]
8487    fn test_encode_ldrsb_arm32() {
8488        let encoder = ArmEncoder::new_arm32();
8489        let op = ArmOp::Ldrsb {
8490            rd: Reg::R0,
8491            addr: MemAddr::imm(Reg::R1, 0),
8492        };
8493        let code = encoder.encode(&op).unwrap();
8494        assert_eq!(code.len(), 4, "ARM32 LDRSB should be 4 bytes");
8495    }
8496
8497    #[test]
8498    fn test_encode_ldrsh_arm32() {
8499        let encoder = ArmEncoder::new_arm32();
8500        let op = ArmOp::Ldrsh {
8501            rd: Reg::R0,
8502            addr: MemAddr::imm(Reg::R1, 0),
8503        };
8504        let code = encoder.encode(&op).unwrap();
8505        assert_eq!(code.len(), 4, "ARM32 LDRSH should be 4 bytes");
8506    }
8507
8508    #[test]
8509    fn test_encode_ldrb_thumb2_16bit() {
8510        let encoder = ArmEncoder::new_thumb2();
8511        let op = ArmOp::Ldrb {
8512            rd: Reg::R0,
8513            addr: MemAddr::imm(Reg::R1, 4),
8514        };
8515        let code = encoder.encode(&op).unwrap();
8516        // Low registers + small offset -> 16-bit encoding
8517        assert_eq!(
8518            code.len(),
8519            2,
8520            "Thumb-2 LDRB with small offset should be 16-bit"
8521        );
8522    }
8523
8524    #[test]
8525    fn test_encode_ldrb_thumb2_32bit() {
8526        let encoder = ArmEncoder::new_thumb2();
8527        let op = ArmOp::Ldrb {
8528            rd: Reg::R0,
8529            addr: MemAddr::imm(Reg::R1, 100), // offset > 31 needs 32-bit
8530        };
8531        let code = encoder.encode(&op).unwrap();
8532        assert_eq!(
8533            code.len(),
8534            4,
8535            "Thumb-2 LDRB with large offset should be 32-bit"
8536        );
8537    }
8538
8539    #[test]
8540    fn test_encode_strb_thumb2_16bit() {
8541        let encoder = ArmEncoder::new_thumb2();
8542        let op = ArmOp::Strb {
8543            rd: Reg::R0,
8544            addr: MemAddr::imm(Reg::R1, 10),
8545        };
8546        let code = encoder.encode(&op).unwrap();
8547        assert_eq!(
8548            code.len(),
8549            2,
8550            "Thumb-2 STRB with small offset should be 16-bit"
8551        );
8552    }
8553
8554    #[test]
8555    fn test_encode_ldrh_thumb2_16bit() {
8556        let encoder = ArmEncoder::new_thumb2();
8557        let op = ArmOp::Ldrh {
8558            rd: Reg::R0,
8559            addr: MemAddr::imm(Reg::R1, 4), // offset aligned to 2, <= 62
8560        };
8561        let code = encoder.encode(&op).unwrap();
8562        assert_eq!(
8563            code.len(),
8564            2,
8565            "Thumb-2 LDRH with small aligned offset should be 16-bit"
8566        );
8567    }
8568
8569    #[test]
8570    fn test_encode_strh_thumb2_16bit() {
8571        let encoder = ArmEncoder::new_thumb2();
8572        let op = ArmOp::Strh {
8573            rd: Reg::R0,
8574            addr: MemAddr::imm(Reg::R1, 4),
8575        };
8576        let code = encoder.encode(&op).unwrap();
8577        assert_eq!(
8578            code.len(),
8579            2,
8580            "Thumb-2 STRH with small aligned offset should be 16-bit"
8581        );
8582    }
8583
8584    #[test]
8585    fn test_encode_ldrsb_thumb2() {
8586        let encoder = ArmEncoder::new_thumb2();
8587        let op = ArmOp::Ldrsb {
8588            rd: Reg::R0,
8589            addr: MemAddr::imm(Reg::R1, 0),
8590        };
8591        let code = encoder.encode(&op).unwrap();
8592        // LDRSB has no 16-bit immediate form, always 32-bit
8593        assert_eq!(code.len(), 4, "Thumb-2 LDRSB should be 32-bit");
8594    }
8595
8596    #[test]
8597    fn test_encode_ldrsh_thumb2() {
8598        let encoder = ArmEncoder::new_thumb2();
8599        let op = ArmOp::Ldrsh {
8600            rd: Reg::R0,
8601            addr: MemAddr::imm(Reg::R1, 0),
8602        };
8603        let code = encoder.encode(&op).unwrap();
8604        assert_eq!(code.len(), 4, "Thumb-2 LDRSH should be 32-bit");
8605    }
8606
8607    #[test]
8608    fn test_encode_memory_size_thumb2() {
8609        let encoder = ArmEncoder::new_thumb2();
8610        let op = ArmOp::MemorySize { rd: Reg::R0 };
8611        let code = encoder.encode(&op).unwrap();
8612        // R0 and R10 are not both low registers, so this needs careful handling
8613        assert!(!code.is_empty(), "MemorySize should produce code");
8614    }
8615
8616    #[test]
8617    fn test_encode_memory_grow_thumb2() {
8618        let encoder = ArmEncoder::new_thumb2();
8619        let op = ArmOp::MemoryGrow {
8620            rd: Reg::R0,
8621            rn: Reg::R0,
8622        };
8623        let code = encoder.encode(&op).unwrap();
8624        assert_eq!(code.len(), 4, "MemoryGrow (MVN) should be 32-bit Thumb-2");
8625    }
8626
8627    #[test]
8628    fn test_encode_subword_reg_offset_thumb2() {
8629        let encoder = ArmEncoder::new_thumb2();
8630
8631        // LDRB with register offset
8632        let op = ArmOp::Ldrb {
8633            rd: Reg::R0,
8634            addr: MemAddr::reg(Reg::R1, Reg::R2),
8635        };
8636        let code = encoder.encode(&op).unwrap();
8637        assert_eq!(
8638            code.len(),
8639            4,
8640            "Thumb-2 LDRB with reg offset should be 32-bit"
8641        );
8642
8643        // STRB with register offset
8644        let op = ArmOp::Strb {
8645            rd: Reg::R0,
8646            addr: MemAddr::reg(Reg::R1, Reg::R2),
8647        };
8648        let code = encoder.encode(&op).unwrap();
8649        assert_eq!(
8650            code.len(),
8651            4,
8652            "Thumb-2 STRB with reg offset should be 32-bit"
8653        );
8654
8655        // LDRH with register offset
8656        let op = ArmOp::Ldrh {
8657            rd: Reg::R0,
8658            addr: MemAddr::reg(Reg::R1, Reg::R2),
8659        };
8660        let code = encoder.encode(&op).unwrap();
8661        assert_eq!(
8662            code.len(),
8663            4,
8664            "Thumb-2 LDRH with reg offset should be 32-bit"
8665        );
8666
8667        // STRH with register offset
8668        let op = ArmOp::Strh {
8669            rd: Reg::R0,
8670            addr: MemAddr::reg(Reg::R1, Reg::R2),
8671        };
8672        let code = encoder.encode(&op).unwrap();
8673        assert_eq!(
8674            code.len(),
8675            4,
8676            "Thumb-2 STRH with reg offset should be 32-bit"
8677        );
8678    }
8679
8680    #[test]
8681    fn test_encode_subword_reg_imm_offset_thumb2() {
8682        let encoder = ArmEncoder::new_thumb2();
8683
8684        // LDRB with both register and immediate offset
8685        let op = ArmOp::Ldrb {
8686            rd: Reg::R0,
8687            addr: MemAddr::reg_imm(Reg::R1, Reg::R2, 4),
8688        };
8689        let code = encoder.encode(&op).unwrap();
8690        // ADD R12, R2, #4 (4 bytes) + LDRB R0, [R1, R12] (4 bytes) = 8 bytes
8691        assert_eq!(
8692            code.len(),
8693            8,
8694            "Thumb-2 LDRB with reg+imm offset should be 8 bytes"
8695        );
8696    }
8697
8698    // ========================================================================
8699    // Helium MVE encoding tests
8700    // ========================================================================
8701
8702    #[test]
8703    fn test_encode_mve_addi32_thumb2() {
8704        let encoder = ArmEncoder::new_thumb2();
8705        let op = ArmOp::MveAddI {
8706            qd: QReg::Q0,
8707            qn: QReg::Q1,
8708            qm: QReg::Q2,
8709            size: MveSize::S32,
8710        };
8711        let code = encoder.encode(&op).unwrap();
8712        assert_eq!(
8713            code.len(),
8714            4,
8715            "MVE VADD.I32 should be 4 bytes (Thumb-2 32-bit)"
8716        );
8717    }
8718
8719    #[test]
8720    fn test_encode_mve_subi16_thumb2() {
8721        let encoder = ArmEncoder::new_thumb2();
8722        let op = ArmOp::MveSubI {
8723            qd: QReg::Q0,
8724            qn: QReg::Q1,
8725            qm: QReg::Q2,
8726            size: MveSize::S16,
8727        };
8728        let code = encoder.encode(&op).unwrap();
8729        assert_eq!(code.len(), 4, "MVE VSUB.I16 should be 4 bytes");
8730    }
8731
8732    #[test]
8733    fn test_encode_mve_muli8_thumb2() {
8734        let encoder = ArmEncoder::new_thumb2();
8735        let op = ArmOp::MveMulI {
8736            qd: QReg::Q0,
8737            qn: QReg::Q1,
8738            qm: QReg::Q2,
8739            size: MveSize::S8,
8740        };
8741        let code = encoder.encode(&op).unwrap();
8742        assert_eq!(code.len(), 4, "MVE VMUL.I8 should be 4 bytes");
8743    }
8744
8745    #[test]
8746    fn test_encode_mve_bitwise_thumb2() {
8747        let encoder = ArmEncoder::new_thumb2();
8748
8749        let ops = vec![
8750            ArmOp::MveAnd {
8751                qd: QReg::Q0,
8752                qn: QReg::Q1,
8753                qm: QReg::Q2,
8754            },
8755            ArmOp::MveOrr {
8756                qd: QReg::Q0,
8757                qn: QReg::Q1,
8758                qm: QReg::Q2,
8759            },
8760            ArmOp::MveEor {
8761                qd: QReg::Q0,
8762                qn: QReg::Q1,
8763                qm: QReg::Q2,
8764            },
8765            ArmOp::MveBic {
8766                qd: QReg::Q0,
8767                qn: QReg::Q1,
8768                qm: QReg::Q2,
8769            },
8770        ];
8771        for op in ops {
8772            let code = encoder.encode(&op).unwrap();
8773            assert_eq!(code.len(), 4, "MVE bitwise op should be 4 bytes");
8774        }
8775    }
8776
8777    #[test]
8778    fn test_encode_mve_mvn_thumb2() {
8779        let encoder = ArmEncoder::new_thumb2();
8780        let op = ArmOp::MveMvn {
8781            qd: QReg::Q0,
8782            qm: QReg::Q1,
8783        };
8784        let code = encoder.encode(&op).unwrap();
8785        assert_eq!(code.len(), 4, "MVE VMVN should be 4 bytes");
8786    }
8787
8788    #[test]
8789    fn test_encode_mve_load_store_thumb2() {
8790        let encoder = ArmEncoder::new_thumb2();
8791
8792        let load = ArmOp::MveLoad {
8793            qd: QReg::Q0,
8794            addr: MemAddr::imm(Reg::R0, 16),
8795        };
8796        let code = encoder.encode(&load).unwrap();
8797        assert_eq!(code.len(), 4, "MVE VLDRW.32 should be 4 bytes");
8798
8799        let store = ArmOp::MveStore {
8800            qd: QReg::Q1,
8801            addr: MemAddr::imm(Reg::R1, 0),
8802        };
8803        let code = encoder.encode(&store).unwrap();
8804        assert_eq!(code.len(), 4, "MVE VSTRW.32 should be 4 bytes");
8805    }
8806
8807    #[test]
8808    fn test_encode_mve_const_thumb2() {
8809        let encoder = ArmEncoder::new_thumb2();
8810        let op = ArmOp::MveConst {
8811            qd: QReg::Q0,
8812            bytes: [1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0],
8813        };
8814        let code = encoder.encode(&op).unwrap();
8815        // Should be 4 words of (MOVW R12 + VMOV Sn) = 4 * (4+4) = 32 bytes min
8816        // Some words with hi16=0 skip MOVT, so length varies
8817        assert!(
8818            code.len() >= 24,
8819            "MVE const should produce multiple instructions"
8820        );
8821    }
8822
8823    #[test]
8824    fn test_encode_mve_dup_thumb2() {
8825        let encoder = ArmEncoder::new_thumb2();
8826        let op = ArmOp::MveDup {
8827            qd: QReg::Q0,
8828            rn: Reg::R0,
8829            size: MveSize::S32,
8830        };
8831        let code = encoder.encode(&op).unwrap();
8832        assert_eq!(code.len(), 4, "MVE VDUP.32 should be 4 bytes");
8833    }
8834
8835    #[test]
8836    fn test_encode_mve_extract_lane_thumb2() {
8837        let encoder = ArmEncoder::new_thumb2();
8838        let op = ArmOp::MveExtractLane {
8839            rd: Reg::R0,
8840            qn: QReg::Q1,
8841            lane: 2,
8842            size: MveSize::S32,
8843        };
8844        let code = encoder.encode(&op).unwrap();
8845        assert_eq!(code.len(), 4, "MVE extract lane should be 4 bytes");
8846    }
8847
8848    #[test]
8849    fn test_encode_mve_insert_lane_thumb2() {
8850        let encoder = ArmEncoder::new_thumb2();
8851        let op = ArmOp::MveInsertLane {
8852            qd: QReg::Q0,
8853            rn: Reg::R1,
8854            lane: 3,
8855            size: MveSize::S32,
8856        };
8857        let code = encoder.encode(&op).unwrap();
8858        assert_eq!(code.len(), 4, "MVE insert lane should be 4 bytes");
8859    }
8860
8861    #[test]
8862    fn test_encode_mve_addf32_thumb2() {
8863        let encoder = ArmEncoder::new_thumb2();
8864        let op = ArmOp::MveAddF32 {
8865            qd: QReg::Q0,
8866            qn: QReg::Q1,
8867            qm: QReg::Q2,
8868        };
8869        let code = encoder.encode(&op).unwrap();
8870        assert_eq!(code.len(), 4, "MVE VADD.F32 should be 4 bytes");
8871    }
8872
8873    #[test]
8874    fn test_encode_mve_divf32_thumb2() {
8875        let encoder = ArmEncoder::new_thumb2();
8876        let op = ArmOp::MveDivF32 {
8877            qd: QReg::Q0,
8878            qn: QReg::Q1,
8879            qm: QReg::Q2,
8880        };
8881        let code = encoder.encode(&op).unwrap();
8882        // Lane-wise: 4 x VDIV.F32 = 4 x 4 = 16 bytes
8883        assert_eq!(
8884            code.len(),
8885            16,
8886            "MVE VDIV.F32 (lane-wise) should be 16 bytes"
8887        );
8888    }
8889
8890    #[test]
8891    fn test_encode_mve_sqrtf32_thumb2() {
8892        let encoder = ArmEncoder::new_thumb2();
8893        let op = ArmOp::MveSqrtF32 {
8894            qd: QReg::Q0,
8895            qm: QReg::Q1,
8896        };
8897        let code = encoder.encode(&op).unwrap();
8898        // Lane-wise: 4 x VSQRT.F32 = 4 x 4 = 16 bytes
8899        assert_eq!(
8900            code.len(),
8901            16,
8902            "MVE VSQRT.F32 (lane-wise) should be 16 bytes"
8903        );
8904    }
8905
8906    #[test]
8907    fn test_encode_mve_negf32_thumb2() {
8908        let encoder = ArmEncoder::new_thumb2();
8909        let op = ArmOp::MveNegF32 {
8910            qd: QReg::Q0,
8911            qm: QReg::Q1,
8912        };
8913        let code = encoder.encode(&op).unwrap();
8914        assert_eq!(code.len(), 4, "MVE VNEG.F32 should be 4 bytes");
8915    }
8916
8917    #[test]
8918    fn test_encode_mve_absf32_thumb2() {
8919        let encoder = ArmEncoder::new_thumb2();
8920        let op = ArmOp::MveAbsF32 {
8921            qd: QReg::Q0,
8922            qm: QReg::Q1,
8923        };
8924        let code = encoder.encode(&op).unwrap();
8925        assert_eq!(code.len(), 4, "MVE VABS.F32 should be 4 bytes");
8926    }
8927
8928    #[test]
8929    fn test_encode_mve_different_qregs() {
8930        let encoder = ArmEncoder::new_thumb2();
8931
8932        // Test that different Q-register numbers produce different encodings
8933        let op1 = ArmOp::MveAddI {
8934            qd: QReg::Q0,
8935            qn: QReg::Q0,
8936            qm: QReg::Q0,
8937            size: MveSize::S32,
8938        };
8939        let op2 = ArmOp::MveAddI {
8940            qd: QReg::Q3,
8941            qn: QReg::Q5,
8942            qm: QReg::Q7,
8943            size: MveSize::S32,
8944        };
8945        let code1 = encoder.encode(&op1).unwrap();
8946        let code2 = encoder.encode(&op2).unwrap();
8947        assert_ne!(
8948            code1, code2,
8949            "Different Q-registers should produce different encodings"
8950        );
8951    }
8952
8953    #[test]
8954    fn test_encode_mve_arm32_nop() {
8955        // MVE instructions on ARM32 encoder should produce NOP (only Thumb-2 supported)
8956        let encoder = ArmEncoder::new_arm32();
8957        let op = ArmOp::MveAddI {
8958            qd: QReg::Q0,
8959            qn: QReg::Q1,
8960            qm: QReg::Q2,
8961            size: MveSize::S32,
8962        };
8963        let code = encoder.encode(&op).unwrap();
8964        assert_eq!(code.len(), 4, "ARM32 MVE should be 4 bytes (NOP)");
8965        // NOP in ARM32 is 0xE1A00000 (MOV R0, R0)
8966        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
8967        assert_eq!(instr, 0xE1A00000, "ARM32 MVE should encode as NOP");
8968    }
8969}