Skip to main content

synth_backend/
arm_encoder.rs

1//! ARM Code Encoder - Converts ARM instructions to binary machine code
2//!
3//! Generates ARM32/Thumb-2 machine code from ARM instruction structures
4
5use synth_core::Result;
6use synth_core::target::FPUPrecision;
7use synth_synthesis::contracts::encoding as encoding_contracts;
8use synth_synthesis::{ArmOp, MemAddr, MveSize, Operand2, QReg, Reg, VfpReg};
9
10/// ARM instruction encoding
11pub struct ArmEncoder {
12    /// Use Thumb mode (vs ARM mode)
13    thumb_mode: bool,
14    /// FPU capability for VFP instruction encoding
15    #[allow(dead_code)]
16    fpu: Option<FPUPrecision>,
17}
18
19impl ArmEncoder {
20    /// Create a new ARM encoder in ARM32 mode
21    pub fn new_arm32() -> Self {
22        Self {
23            thumb_mode: false,
24            fpu: None,
25        }
26    }
27
28    /// Create a new ARM encoder in Thumb-2 mode
29    pub fn new_thumb2() -> Self {
30        Self {
31            thumb_mode: true,
32            fpu: None,
33        }
34    }
35
36    /// Create a new Thumb-2 encoder with FPU capability
37    pub fn new_thumb2_with_fpu(fpu: Option<FPUPrecision>) -> Self {
38        Self {
39            thumb_mode: true,
40            fpu,
41        }
42    }
43
44    /// Encode a single ARM instruction to bytes
45    pub fn encode(&self, op: &ArmOp) -> Result<Vec<u8>> {
46        if self.thumb_mode {
47            self.encode_thumb(op)
48        } else {
49            self.encode_arm(op)
50        }
51    }
52
53    /// Encode an ARM instruction in ARM32 mode (32-bit instructions)
54    /// #206: encode an ARM32 (A32) load/store whose address uses a register
55    /// offset (`[rn, rm{, #off}]`). Returns `None` for ops with no register
56    /// offset (the caller falls through to the immediate-form arms). Computes
57    /// `ip = base + rm` then re-encodes the op against `[ip, #off]`, which works
58    /// uniformly for word/byte/halfword/signed forms. IP (R12) is the scratch
59    /// register the selector already treats as clobberable across memory ops.
60    fn encode_arm_reg_offset_mem(&self, op: &ArmOp) -> Result<Option<Vec<u8>>> {
61        use synth_synthesis::Reg;
62        let addr = match op {
63            ArmOp::Ldr { addr, .. }
64            | ArmOp::Str { addr, .. }
65            | ArmOp::Ldrb { addr, .. }
66            | ArmOp::Strb { addr, .. }
67            | ArmOp::Ldrh { addr, .. }
68            | ArmOp::Strh { addr, .. }
69            | ArmOp::Ldrsb { addr, .. }
70            | ArmOp::Ldrsh { addr, .. } => addr,
71            _ => return Ok(None),
72        };
73        let Some(rm) = addr.offset_reg else {
74            return Ok(None);
75        };
76        let ip = Reg::R12;
77        // ADD ip, base, rm  (cond=AL, opcode=ADD, S=0, register operand2)
78        let add: u32 = 0xE0800000
79            | (reg_to_bits(&addr.base) << 16)
80            | (reg_to_bits(&ip) << 12)
81            | reg_to_bits(&rm);
82        let mut bytes = add.to_le_bytes().to_vec();
83        // Re-encode the op against [ip, #off] (immediate form → no offset_reg,
84        // so this recursion hits the immediate arms, not this helper again).
85        let imm_addr = MemAddr::imm(ip, addr.offset);
86        let imm_op = match op {
87            ArmOp::Ldr { rd, .. } => ArmOp::Ldr {
88                rd: *rd,
89                addr: imm_addr,
90            },
91            ArmOp::Str { rd, .. } => ArmOp::Str {
92                rd: *rd,
93                addr: imm_addr,
94            },
95            ArmOp::Ldrb { rd, .. } => ArmOp::Ldrb {
96                rd: *rd,
97                addr: imm_addr,
98            },
99            ArmOp::Strb { rd, .. } => ArmOp::Strb {
100                rd: *rd,
101                addr: imm_addr,
102            },
103            ArmOp::Ldrh { rd, .. } => ArmOp::Ldrh {
104                rd: *rd,
105                addr: imm_addr,
106            },
107            ArmOp::Strh { rd, .. } => ArmOp::Strh {
108                rd: *rd,
109                addr: imm_addr,
110            },
111            ArmOp::Ldrsb { rd, .. } => ArmOp::Ldrsb {
112                rd: *rd,
113                addr: imm_addr,
114            },
115            ArmOp::Ldrsh { rd, .. } => ArmOp::Ldrsh {
116                rd: *rd,
117                addr: imm_addr,
118            },
119            _ => unreachable!(),
120        };
121        bytes.extend(self.encode_arm(&imm_op)?);
122        Ok(Some(bytes))
123    }
124
125    fn encode_arm(&self, op: &ArmOp) -> Result<Vec<u8>> {
126        // #206: ARM32 register-offset loads/stores. `encode_mem_addr` only
127        // returns the 12-bit immediate, so the immediate-form arms below
128        // silently DROP `addr.offset_reg` — a runtime address index vanished,
129        // turning `ldr rd,[rn,rm,#off]` into `ldr rd,[rn,#off]` (the access went
130        // to the wrong address). Compute the effective base into IP and re-encode
131        // against `[ip, #off]`, which is uniform for word/byte/halfword/signed.
132        if let Some(bytes) = self.encode_arm_reg_offset_mem(op)? {
133            return Ok(bytes);
134        }
135        let instr: u32 = match op {
136            // Data processing instructions
137            ArmOp::Add { rd, rn, op2 } => {
138                let rd_bits = reg_to_bits(rd);
139                let rn_bits = reg_to_bits(rn);
140                let (op2_bits, i_flag) = encode_operand2(op2)?;
141
142                // ADD encoding: cond(4) | 00 | I(1) | 0100 | S(1) | Rn(4) | Rd(4) | operand2(12)
143                0xE0800000 // condition=always(E), opcode=ADD(0100), S=0
144                    | (i_flag << 25)
145                    | (rn_bits << 16)
146                    | (rd_bits << 12)
147                    | op2_bits
148            }
149
150            ArmOp::Sub { rd, rn, op2 } => {
151                let rd_bits = reg_to_bits(rd);
152                let rn_bits = reg_to_bits(rn);
153                let (op2_bits, i_flag) = encode_operand2(op2)?;
154
155                // SUB encoding: opcode=0010
156                0xE0400000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
157            }
158
159            // i64 support: ADDS, ADC, SUBS, SBC for ARM32
160            ArmOp::Adds { rd, rn, op2 } => {
161                let rd_bits = reg_to_bits(rd);
162                let rn_bits = reg_to_bits(rn);
163                let (op2_bits, i_flag) = encode_operand2(op2)?;
164
165                // ADDS encoding: opcode=0100, S=1
166                0xE0900000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
167            }
168
169            ArmOp::Adc { rd, rn, op2 } => {
170                let rd_bits = reg_to_bits(rd);
171                let rn_bits = reg_to_bits(rn);
172                let (op2_bits, i_flag) = encode_operand2(op2)?;
173
174                // ADC encoding: opcode=0101
175                0xE0A00000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
176            }
177
178            ArmOp::Subs { rd, rn, op2 } => {
179                let rd_bits = reg_to_bits(rd);
180                let rn_bits = reg_to_bits(rn);
181                let (op2_bits, i_flag) = encode_operand2(op2)?;
182
183                // SUBS encoding: opcode=0010, S=1
184                0xE0500000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
185            }
186
187            ArmOp::Sbc { rd, rn, op2 } => {
188                let rd_bits = reg_to_bits(rd);
189                let rn_bits = reg_to_bits(rn);
190                let (op2_bits, i_flag) = encode_operand2(op2)?;
191
192                // SBC encoding: opcode=0110
193                0xE0C00000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
194            }
195
196            ArmOp::Mul { rd, rn, rm } => {
197                let rd_bits = reg_to_bits(rd);
198                let rn_bits = reg_to_bits(rn);
199                let rm_bits = reg_to_bits(rm);
200
201                // MUL encoding: cond(4) | 000000 | A(1) | S(1) | Rd(4) | Rn(4) | Rs(4) | 1001 | Rm(4)
202                0xE0000090 | (rd_bits << 16) | (rn_bits << 8) | rm_bits
203            }
204
205            ArmOp::Umull { rdlo, rdhi, rn, rm } => {
206                let rdlo_bits = reg_to_bits(rdlo);
207                let rdhi_bits = reg_to_bits(rdhi);
208                let rn_bits = reg_to_bits(rn);
209                let rm_bits = reg_to_bits(rm);
210
211                // UMULL encoding: cond(4) | 0000 1000 | RdHi(4) | RdLo(4) | Rm(4) | 1001 | Rn(4)
212                0xE0800090 | (rdhi_bits << 16) | (rdlo_bits << 12) | (rm_bits << 8) | rn_bits
213            }
214
215            ArmOp::Sdiv { rd, rn, rm } => {
216                let rd_bits = reg_to_bits(rd);
217                let rn_bits = reg_to_bits(rn);
218                let rm_bits = reg_to_bits(rm);
219
220                // SDIV encoding: cond(4) | 01110001 | Rd(4) | 1111 | Rm(4) | 0001 | Rn(4)
221                // ARMv7-M and above
222                0xE710F010 | (rd_bits << 16) | (rm_bits << 8) | rn_bits
223            }
224
225            ArmOp::Udiv { rd, rn, rm } => {
226                let rd_bits = reg_to_bits(rd);
227                let rn_bits = reg_to_bits(rn);
228                let rm_bits = reg_to_bits(rm);
229
230                // UDIV encoding: cond(4) | 01110011 | Rd(4) | 1111 | Rm(4) | 0001 | Rn(4)
231                // ARMv7-M and above
232                0xE730F010 | (rd_bits << 16) | (rm_bits << 8) | rn_bits
233            }
234
235            ArmOp::Mls { rd, rn, rm, ra } => {
236                let rd_bits = reg_to_bits(rd);
237                let rn_bits = reg_to_bits(rn);
238                let rm_bits = reg_to_bits(rm);
239                let ra_bits = reg_to_bits(ra);
240
241                // MLS encoding: cond(4) | 00000110 | Rd(4) | Ra(4) | Rm(4) | 1001 | Rn(4)
242                // Rd = Ra - (Rn * Rm)
243                0xE0600090 | (rd_bits << 16) | (ra_bits << 12) | (rm_bits << 8) | rn_bits
244            }
245
246            ArmOp::Mla { rd, rn, rm, ra } => {
247                let rd_bits = reg_to_bits(rd);
248                let rn_bits = reg_to_bits(rn);
249                let rm_bits = reg_to_bits(rm);
250                let ra_bits = reg_to_bits(ra);
251
252                // MLA encoding: cond(4) | 0000001 S | Rd(4) | Ra(4) | Rm(4) | 1001 | Rn(4)
253                // Rd = Ra + (Rn * Rm). Base 0xE0200090 (S=0).
254                0xE0200090 | (rd_bits << 16) | (ra_bits << 12) | (rm_bits << 8) | rn_bits
255            }
256
257            ArmOp::And { rd, rn, op2 } => {
258                let rd_bits = reg_to_bits(rd);
259                let rn_bits = reg_to_bits(rn);
260                let (op2_bits, i_flag) = encode_operand2(op2)?;
261
262                // AND encoding: opcode=0000
263                0xE0000000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
264            }
265
266            ArmOp::Orr { rd, rn, op2 } => {
267                let rd_bits = reg_to_bits(rd);
268                let rn_bits = reg_to_bits(rn);
269                let (op2_bits, i_flag) = encode_operand2(op2)?;
270
271                // ORR encoding: opcode=1100
272                0xE1800000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
273            }
274
275            ArmOp::Eor { rd, rn, op2 } => {
276                let rd_bits = reg_to_bits(rd);
277                let rn_bits = reg_to_bits(rn);
278                let (op2_bits, i_flag) = encode_operand2(op2)?;
279
280                // EOR encoding: opcode=0001
281                0xE0200000 | (i_flag << 25) | (rn_bits << 16) | (rd_bits << 12) | op2_bits
282            }
283
284            // Shift instructions
285            ArmOp::Lsl { rd, rn, shift } => {
286                let rd_bits = reg_to_bits(rd);
287                let rn_bits = reg_to_bits(rn);
288                let shift_bits = *shift & 0x1F;
289
290                // LSL encoding: MOV with shift
291                0xE1A00000 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
292            }
293
294            ArmOp::Lsr { rd, rn, shift } => {
295                let rd_bits = reg_to_bits(rd);
296                let rn_bits = reg_to_bits(rn);
297                let shift_bits = *shift & 0x1F;
298
299                // LSR encoding
300                0xE1A00020 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
301            }
302
303            ArmOp::Asr { rd, rn, shift } => {
304                let rd_bits = reg_to_bits(rd);
305                let rn_bits = reg_to_bits(rn);
306                let shift_bits = *shift & 0x1F;
307
308                // ASR encoding
309                0xE1A00040 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
310            }
311
312            ArmOp::Ror { rd, rn, shift } => {
313                let rd_bits = reg_to_bits(rd);
314                let rn_bits = reg_to_bits(rn);
315                let shift_bits = *shift & 0x1F;
316
317                // ROR encoding: MOV with ROR shift
318                0xE1A00060 | (rd_bits << 12) | (shift_bits << 7) | rn_bits
319            }
320
321            // Register-based shifts (ARM32)
322            // LSL Rd, Rn, Rm: cond 0001101S 0000 Rd Rs 0001 Rn
323            ArmOp::LslReg { rd, rn, rm } => {
324                let rd_bits = reg_to_bits(rd);
325                let rn_bits = reg_to_bits(rn);
326                let rm_bits = reg_to_bits(rm);
327                0xE1A00010 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
328            }
329            ArmOp::LsrReg { rd, rn, rm } => {
330                let rd_bits = reg_to_bits(rd);
331                let rn_bits = reg_to_bits(rn);
332                let rm_bits = reg_to_bits(rm);
333                0xE1A00030 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
334            }
335            ArmOp::AsrReg { rd, rn, rm } => {
336                let rd_bits = reg_to_bits(rd);
337                let rn_bits = reg_to_bits(rn);
338                let rm_bits = reg_to_bits(rm);
339                0xE1A00050 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
340            }
341            ArmOp::RorReg { rd, rn, rm } => {
342                let rd_bits = reg_to_bits(rd);
343                let rn_bits = reg_to_bits(rn);
344                let rm_bits = reg_to_bits(rm);
345                0xE1A00070 | (rd_bits << 12) | (rm_bits << 8) | rn_bits
346            }
347
348            // RSB (Reverse Subtract): Rd = imm - Rn
349            ArmOp::Rsb { rd, rn, imm } => {
350                let rd_bits = reg_to_bits(rd);
351                let rn_bits = reg_to_bits(rn);
352                // RSB encoding: cond(4) | 00 1 0011 S | Rn(4) | Rd(4) | imm12
353                // Opcode for RSB = 0011, I=1 (immediate), S=0
354                0xE2600000 | (rn_bits << 16) | (rd_bits << 12) | (*imm & 0xFF)
355            }
356
357            // Bit manipulation instructions
358            ArmOp::Clz { rd, rm } => {
359                let rd_bits = reg_to_bits(rd);
360                let rm_bits = reg_to_bits(rm);
361
362                // CLZ encoding: cond(4) | 00010110 | 1111 | Rd(4) | 1111 | 0001 | Rm(4)
363                // ARMv5T and above
364                0xE16F0F10 | (rd_bits << 12) | rm_bits
365            }
366
367            ArmOp::Rbit { rd, rm } => {
368                let rd_bits = reg_to_bits(rd);
369                let rm_bits = reg_to_bits(rm);
370
371                // RBIT encoding: cond(4) | 01101111 | 1111 | Rd(4) | 1111 | 0011 | Rm(4)
372                // ARMv6T2 and above
373                0xE6FF0F30 | (rd_bits << 12) | rm_bits
374            }
375
376            ArmOp::Sxtb { rd, rm } => {
377                let rd_bits = reg_to_bits(rd);
378                let rm_bits = reg_to_bits(rm);
379
380                // SXTB encoding: cond(4) | 01101010 | 1111 | Rd(4) | rotate(2) | 00 | 0111 | Rm(4)
381                // ARMv6 and above. rotate=00 for no rotation
382                0xE6AF0070 | (rd_bits << 12) | rm_bits
383            }
384
385            ArmOp::Sxth { rd, rm } => {
386                let rd_bits = reg_to_bits(rd);
387                let rm_bits = reg_to_bits(rm);
388
389                // SXTH encoding: cond(4) | 01101011 | 1111 | Rd(4) | rotate(2) | 00 | 0111 | Rm(4)
390                // ARMv6 and above. rotate=00 for no rotation
391                0xE6BF0070 | (rd_bits << 12) | rm_bits
392            }
393
394            ArmOp::Uxtb { rd, rm } => {
395                let rd_bits = reg_to_bits(rd);
396                let rm_bits = reg_to_bits(rm);
397                // UXTB encoding: cond | 01101110 1111 Rd rotate 00 0111 Rm (rotate=00)
398                0xE6EF0070 | (rd_bits << 12) | rm_bits
399            }
400
401            ArmOp::Uxth { rd, rm } => {
402                let rd_bits = reg_to_bits(rd);
403                let rm_bits = reg_to_bits(rm);
404                // UXTH encoding: cond | 01101111 1111 Rd rotate 00 0111 Rm (rotate=00)
405                0xE6FF0070 | (rd_bits << 12) | rm_bits
406            }
407
408            // Move instructions
409            ArmOp::Mov { rd, op2 } => {
410                let rd_bits = reg_to_bits(rd);
411                let (op2_bits, i_flag) = encode_operand2(op2)?;
412
413                // MOV encoding: opcode=1101
414                0xE1A00000 | (i_flag << 25) | (rd_bits << 12) | op2_bits
415            }
416
417            ArmOp::Mvn { rd, op2 } => {
418                let rd_bits = reg_to_bits(rd);
419                let (op2_bits, i_flag) = encode_operand2(op2)?;
420
421                // MVN encoding: opcode=1111
422                0xE1E00000 | (i_flag << 25) | (rd_bits << 12) | op2_bits
423            }
424
425            // MOVW - Move Wide (ARM32)
426            // Encoding: cond(4) | 0011 0000 | imm4(4) | Rd(4) | imm12(12)
427            ArmOp::Movw { rd, imm16 } => {
428                let rd_bits = reg_to_bits(rd);
429                let imm4 = ((*imm16 as u32) >> 12) & 0xF;
430                let imm12 = (*imm16 as u32) & 0xFFF;
431                0xE3000000 | (imm4 << 16) | (rd_bits << 12) | imm12
432            }
433
434            // MOVT - Move Top (ARM32)
435            // Encoding: cond(4) | 0011 0100 | imm4(4) | Rd(4) | imm12(12)
436            ArmOp::Movt { rd, imm16 } => {
437                let rd_bits = reg_to_bits(rd);
438                let imm4 = ((*imm16 as u32) >> 12) & 0xF;
439                let imm12 = (*imm16 as u32) & 0xFFF;
440                0xE3400000 | (imm4 << 16) | (rd_bits << 12) | imm12
441            }
442
443            // #237: symbol-relative MOVW/MOVT (ARM mode) — addend in place, the
444            // backend records the MOVW_ABS/MOVT_ABS relocation against `symbol`.
445            ArmOp::MovwSym { rd, addend, .. } => {
446                let rd_bits = reg_to_bits(rd);
447                let v = (*addend as u32) & 0xffff;
448                0xE3000000 | (((v >> 12) & 0xF) << 16) | (rd_bits << 12) | (v & 0xFFF)
449            }
450            ArmOp::MovtSym { rd, addend, .. } => {
451                let rd_bits = reg_to_bits(rd);
452                let v = ((*addend as u32) >> 16) & 0xffff;
453                0xE3400000 | (((v >> 12) & 0xF) << 16) | (rd_bits << 12) | (v & 0xFFF)
454            }
455
456            // #345: LdrSym is the Thumb-2 literal-pool address load. A32 mode is
457            // not used for relocatable native-pointer objects; fail loudly rather
458            // than miscompile if it is ever reached here.
459            ArmOp::LdrSym { .. } => {
460                return Err(synth_core::Error::synthesis(
461                    "LdrSym (literal-pool address load) is Thumb-2-only",
462                ));
463            }
464
465            // Compare
466            ArmOp::Cmp { rn, op2 } => {
467                let rn_bits = reg_to_bits(rn);
468                let (op2_bits, i_flag) = encode_operand2(op2)?;
469
470                // CMP encoding: opcode=1010, S=1
471                0xE1500000 | (i_flag << 25) | (rn_bits << 16) | op2_bits
472            }
473
474            // Compare Negative (CMN) - computes Rn + op2 and sets flags
475            ArmOp::Cmn { rn, op2 } => {
476                let rn_bits = reg_to_bits(rn);
477                let (op2_bits, i_flag) = encode_operand2(op2)?;
478
479                // CMN encoding: opcode=1011, S=1
480                0xE1700000 | (i_flag << 25) | (rn_bits << 16) | op2_bits
481            }
482
483            // Load/Store
484            ArmOp::Ldr { rd, addr } => {
485                let rd_bits = reg_to_bits(rd);
486                let (base_bits, offset_bits) = encode_mem_addr(addr);
487
488                // LDR encoding: cond(4) | 01 | I(1) | P(1) | U(1) | B(1) | W(1) | L(1) | Rn(4) | Rd(4) | offset(12)
489                // P=1 (pre-indexed), U=1 (add offset), L=1 (load)
490                0xE5900000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
491            }
492
493            ArmOp::Str { rd, addr } => {
494                let rd_bits = reg_to_bits(rd);
495                let (base_bits, offset_bits) = encode_mem_addr(addr);
496
497                // STR encoding: L=0 (store)
498                0xE5800000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
499            }
500
501            // Sub-word loads (ARM32 encoding)
502            ArmOp::Ldrb { rd, addr } => {
503                let rd_bits = reg_to_bits(rd);
504                let (base_bits, offset_bits) = encode_mem_addr(addr);
505                // LDRB: LDR with B=1 (byte): cond|01|I|P|U|1|W|L|Rn|Rd|offset
506                0xE5D00000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
507            }
508
509            ArmOp::Ldrsb { rd, addr } => {
510                let rd_bits = reg_to_bits(rd);
511                let (base_bits, offset_bits) = encode_mem_addr(addr);
512                // LDRSB (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1101|imm4L
513                // Simplified with immediate offset
514                let offset_val = offset_bits & 0xFF;
515                let imm4h = (offset_val >> 4) & 0xF;
516                let imm4l = offset_val & 0xF;
517                0xE1D000D0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
518            }
519
520            ArmOp::Ldrh { rd, addr } => {
521                let rd_bits = reg_to_bits(rd);
522                let (base_bits, offset_bits) = encode_mem_addr(addr);
523                // LDRH (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1011|imm4L
524                let offset_val = offset_bits & 0xFF;
525                let imm4h = (offset_val >> 4) & 0xF;
526                let imm4l = offset_val & 0xF;
527                0xE1D000B0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
528            }
529
530            ArmOp::Ldrsh { rd, addr } => {
531                let rd_bits = reg_to_bits(rd);
532                let (base_bits, offset_bits) = encode_mem_addr(addr);
533                // LDRSH (misc load): cond|000|P|U|1|W|1|Rn|Rd|imm4H|1111|imm4L
534                let offset_val = offset_bits & 0xFF;
535                let imm4h = (offset_val >> 4) & 0xF;
536                let imm4l = offset_val & 0xF;
537                0xE1D000F0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
538            }
539
540            // Sub-word stores (ARM32 encoding)
541            ArmOp::Strb { rd, addr } => {
542                let rd_bits = reg_to_bits(rd);
543                let (base_bits, offset_bits) = encode_mem_addr(addr);
544                // STRB: STR with B=1 (byte): cond|01|I|P|U|1|W|0|Rn|Rd|offset
545                0xE5C00000 | (base_bits << 16) | (rd_bits << 12) | offset_bits
546            }
547
548            ArmOp::Strh { rd, addr } => {
549                let rd_bits = reg_to_bits(rd);
550                let (base_bits, offset_bits) = encode_mem_addr(addr);
551                // STRH (misc store): cond|000|P|U|1|W|0|Rn|Rd|imm4H|1011|imm4L
552                let offset_val = offset_bits & 0xFF;
553                let imm4h = (offset_val >> 4) & 0xF;
554                let imm4l = offset_val & 0xF;
555                0xE1C000B0 | (base_bits << 16) | (rd_bits << 12) | (imm4h << 8) | imm4l
556            }
557
558            // Memory management (ARM32 encoding)
559            ArmOp::MemorySize { rd } => {
560                let rd_bits = reg_to_bits(rd);
561                // MOV rd, R10, LSR #16  (memory size in bytes / 65536 = pages)
562                // cond|000|1101|S|0000|Rd|shift5|type|0|Rm
563                // LSR #16: shift5=10000, type=01
564                0xE1A00820 | (rd_bits << 12) | 0x0A // Rm=R10, shift=16, LSR
565            }
566
567            ArmOp::MemoryGrow { rd, .. } => {
568                let rd_bits = reg_to_bits(rd);
569                // On embedded, always fail: MOV rd, #-1
570                0xE3E00000 | (rd_bits << 12) // MVN rd, #0 = MOV rd, #-1
571            }
572
573            // Label pseudo-instruction: emits no machine code
574            ArmOp::Label { .. } => {
575                return Ok(Vec::new());
576            }
577
578            // Branch instructions
579            ArmOp::B { label: _ } => {
580                // B encoding: cond(4) | 1010 | offset(24)
581                // Simplified: branch to offset 0 (will be patched by linker/resolver)
582                0xEA000000
583            }
584
585            // Conditional branch to label (generic)
586            ArmOp::Bcc { cond, label: _ } => {
587                use synth_synthesis::Condition;
588                let cond_bits: u32 = match cond {
589                    Condition::EQ => 0x0,
590                    Condition::NE => 0x1,
591                    Condition::HS => 0x2,
592                    Condition::LO => 0x3,
593                    Condition::HI => 0x8,
594                    Condition::LS => 0x9,
595                    Condition::GE => 0xA,
596                    Condition::LT => 0xB,
597                    Condition::GT => 0xC,
598                    Condition::LE => 0xD,
599                };
600                // B<cond> with offset 0 (will be patched)
601                (cond_bits << 28) | 0x0A000000
602            }
603
604            // BHS (Branch if Higher or Same) - used for bounds checking
605            ArmOp::Bhs { label: _ } => {
606                // BHS encoding: cond(2=HS) | 1010 | offset(24)
607                0x2A000000 // BHS with offset 0
608            }
609
610            // BLO (Branch if Lower) - complementary to BHS
611            ArmOp::Blo { label: _ } => {
612                // BLO encoding: cond(3=LO) | 1010 | offset(24)
613                0x3A000000 // BLO with offset 0
614            }
615
616            // Branch with numeric offset (in instructions)
617            // ARM32 B instruction: offset is in instructions, stored as words
618            // The offset is relative to PC+8 (due to ARM pipeline)
619            ArmOp::BOffset { offset } => {
620                // B encoding: cond(4) | 1010 | offset(24)
621                // Offset is signed, in words (4-byte units)
622                // ARM adds PC+8 to the offset, so we need to adjust:
623                // target = PC + 8 + (offset * 4)
624                // For backward branch of N instructions: offset = -(N + 2)
625                // wrapping_sub keeps the encoder total under fuzzing (#186): an
626                // extreme i32::MIN offset would otherwise overflow-panic; for any
627                // real branch offset this is identical to `- 2`.
628                let adjusted_offset = offset.wrapping_sub(2); // Account for PC+8
629                let offset_bits = (adjusted_offset as u32) & 0x00FFFFFF;
630                0xEA000000 | offset_bits
631            }
632
633            // Conditional branch with numeric offset
634            ArmOp::BCondOffset { cond, offset } => {
635                use synth_synthesis::Condition;
636                let cond_bits: u32 = match cond {
637                    Condition::EQ => 0x0,
638                    Condition::NE => 0x1,
639                    Condition::HS => 0x2,
640                    Condition::LO => 0x3,
641                    Condition::HI => 0x8,
642                    Condition::LS => 0x9,
643                    Condition::GE => 0xA,
644                    Condition::LT => 0xB,
645                    Condition::GT => 0xC,
646                    Condition::LE => 0xD,
647                };
648                // B<cond> encoding: cond(4) | 1010 | offset(24)
649                // wrapping_sub: total under fuzzing (#186), identical for real offsets.
650                let adjusted_offset = offset.wrapping_sub(2); // Account for PC+8
651                let offset_bits = (adjusted_offset as u32) & 0x00FFFFFF;
652                (cond_bits << 28) | 0x0A000000 | offset_bits
653            }
654
655            ArmOp::Bl { label: _ } => {
656                // BL encoding: cond(4) | 1011 | offset(24)
657                0xEB000000
658            }
659
660            ArmOp::Bx { rm } => {
661                let rm_bits = reg_to_bits(rm);
662
663                // BX encoding: cond(4) | 000100101111111111110001 | Rm(4)
664                0xE12FFF10 | rm_bits
665            }
666
667            ArmOp::Blx { rm } => {
668                let rm_bits = reg_to_bits(rm);
669
670                // BLX (register) encoding: cond(4) | 000100101111111111110011 | Rm(4)
671                0xE12FFF30 | rm_bits
672            }
673
674            ArmOp::Push { regs } => {
675                // STMDB SP!, {regs} encoding: cond(4) | 100100 | 10 | 1101 | register_list(16)
676                let mut reg_list: u32 = 0;
677                for r in regs {
678                    reg_list |= 1 << reg_to_bits(r);
679                }
680                0xE92D0000 | reg_list
681            }
682
683            ArmOp::Pop { regs } => {
684                // LDMIA SP!, {regs} encoding: cond(4) | 100010 | 11 | 1101 | register_list(16)
685                let mut reg_list: u32 = 0;
686                for r in regs {
687                    reg_list |= 1 << reg_to_bits(r);
688                }
689                0xE8BD0000 | reg_list
690            }
691
692            ArmOp::Nop => {
693                // NOP encoding: MOV R0, R0
694                0xE1A00000
695            }
696
697            ArmOp::Udf { imm } => {
698                // UDF (Undefined) encoding in ARM: 0xE7F000F0 | (imm12_hi << 8) | imm4_lo
699                // We only use imm8, so split into imm4_hi and imm4_lo
700                let imm8 = *imm as u32;
701                0xE7F000F0 | ((imm8 & 0xF0) << 4) | (imm8 & 0x0F)
702            }
703
704            // Pseudo-instructions for verification - encode as NOP
705            // These are used in formal verification but not actual code generation
706            ArmOp::Popcnt { .. } => {
707                // Population count pseudo-instruction
708                // Not a real ARM instruction, would be expanded to actual code
709                0xE1A00000 // NOP for now
710            }
711
712            ArmOp::SetCond { .. } => {
713                // Condition evaluation pseudo-instruction
714                // Not a real ARM instruction, would be expanded to actual code
715                0xE1A00000 // NOP for now
716            }
717
718            ArmOp::SelectMove { .. } => {
719                // Conditional move pseudo-instruction for ARM32
720                // Would use MOV{cond} instruction
721                0xE1A00000 // NOP for now
722            }
723
724            ArmOp::Select { .. } => {
725                // Select pseudo-instruction
726                // Not a real ARM instruction, would be expanded to conditional moves
727                0xE1A00000 // NOP for now
728            }
729
730            ArmOp::LocalGet { .. } => {
731                // Local variable get pseudo-instruction
732                // Not a real ARM instruction, would be expanded to memory access
733                0xE1A00000 // NOP for now
734            }
735
736            ArmOp::LocalSet { .. } => {
737                // Local variable set pseudo-instruction
738                // Not a real ARM instruction, would be expanded to memory access
739                0xE1A00000 // NOP for now
740            }
741
742            ArmOp::LocalTee { .. } => {
743                // Local variable tee pseudo-instruction
744                // Not a real ARM instruction, would be expanded to memory access
745                0xE1A00000 // NOP for now
746            }
747
748            ArmOp::GlobalGet { .. } => {
749                // Global variable get pseudo-instruction
750                // Not a real ARM instruction, would be expanded to memory access
751                0xE1A00000 // NOP for now
752            }
753
754            ArmOp::GlobalSet { .. } => {
755                // Global variable set pseudo-instruction
756                // Not a real ARM instruction, would be expanded to memory access
757                0xE1A00000 // NOP for now
758            }
759
760            ArmOp::BrTable { .. } => {
761                // Branch table pseudo-instruction
762                // Not a real ARM instruction, would be expanded to jump table
763                0xE1A00000 // NOP for now
764            }
765
766            ArmOp::Call { .. } => {
767                // Function call pseudo-instruction
768                // Not a real ARM instruction, would be expanded to BL
769                0xE1A00000 // NOP for now
770            }
771
772            ArmOp::CallIndirect { .. } => {
773                // Indirect function call pseudo-instruction
774                // Not a real ARM instruction, would be expanded to indirect branch
775                0xE1A00000 // NOP for now
776            }
777
778            // i64 pseudo-instructions (Phase 2) - encode as NOP for now
779            // Real compiler would expand these to multi-instruction sequences
780            ArmOp::I64Add { .. } => 0xE1A00000,        // NOP
781            ArmOp::I64Sub { .. } => 0xE1A00000,        // NOP
782            ArmOp::I64DivS { .. } => 0xE1A00000,       // NOP
783            ArmOp::I64DivU { .. } => 0xE1A00000,       // NOP
784            ArmOp::I64RemS { .. } => 0xE1A00000,       // NOP
785            ArmOp::I64RemU { .. } => 0xE1A00000,       // NOP
786            ArmOp::I64Clz { .. } => 0xE1A00000,        // NOP
787            ArmOp::I64Ctz { .. } => 0xE1A00000,        // NOP
788            ArmOp::I64Popcnt { .. } => 0xE1A00000,     // NOP
789            ArmOp::I64And { .. } => 0xE1A00000,        // NOP
790            ArmOp::I64Or { .. } => 0xE1A00000,         // NOP
791            ArmOp::I64Xor { .. } => 0xE1A00000,        // NOP
792            ArmOp::I64Eqz { .. } => 0xE1A00000,        // NOP
793            ArmOp::I64Eq { .. } => 0xE1A00000,         // NOP
794            ArmOp::I64Ne { .. } => 0xE1A00000,         // NOP
795            ArmOp::I64LtS { .. } => 0xE1A00000,        // NOP
796            ArmOp::I64LtU { .. } => 0xE1A00000,        // NOP
797            ArmOp::I64LeS { .. } => 0xE1A00000,        // NOP
798            ArmOp::I64LeU { .. } => 0xE1A00000,        // NOP
799            ArmOp::I64GtS { .. } => 0xE1A00000,        // NOP
800            ArmOp::I64GtU { .. } => 0xE1A00000,        // NOP
801            ArmOp::I64GeS { .. } => 0xE1A00000,        // NOP
802            ArmOp::I64GeU { .. } => 0xE1A00000,        // NOP
803            ArmOp::I64Const { .. } => 0xE1A00000,      // NOP
804            ArmOp::I64Ldr { .. } => 0xE1A00000,        // NOP
805            ArmOp::I64Str { .. } => 0xE1A00000,        // NOP
806            ArmOp::I64ExtendI32S { .. } => 0xE1A00000, // NOP
807            ArmOp::I64ExtendI32U { .. } => 0xE1A00000, // NOP
808            ArmOp::I64Extend8S { .. } => 0xE1A00000,   // NOP (Thumb-2 only)
809            ArmOp::I64Extend16S { .. } => 0xE1A00000,  // NOP (Thumb-2 only)
810            ArmOp::I64Extend32S { .. } => 0xE1A00000,  // NOP (Thumb-2 only)
811            ArmOp::I32WrapI64 { .. } => 0xE1A00000,    // NOP
812
813            // f32 VFP single-precision instructions
814            ArmOp::F32Add { sd, sn, sm } => encode_vfp_3reg(0xEE300A00, sd, sn, sm)?,
815            ArmOp::F32Sub { sd, sn, sm } => encode_vfp_3reg(0xEE300A40, sd, sn, sm)?,
816            ArmOp::F32Mul { sd, sn, sm } => encode_vfp_3reg(0xEE200A00, sd, sn, sm)?,
817            ArmOp::F32Div { sd, sn, sm } => encode_vfp_3reg(0xEE800A00, sd, sn, sm)?,
818            ArmOp::F32Abs { sd, sm } => encode_vfp_2reg(0xEEB00AC0, sd, sm)?,
819            ArmOp::F32Neg { sd, sm } => encode_vfp_2reg(0xEEB10A40, sd, sm)?,
820            ArmOp::F32Sqrt { sd, sm } => encode_vfp_2reg(0xEEB10AC0, sd, sm)?,
821
822            // f32 pseudo-ops — multi-instruction sequences
823            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
824            ArmOp::F32Ceil { sd, sm } => {
825                return self.encode_arm_f32_rounding(sd, sm, 0b01); // Round toward +Inf
826            }
827            ArmOp::F32Floor { sd, sm } => {
828                return self.encode_arm_f32_rounding(sd, sm, 0b10); // Round toward -Inf
829            }
830            ArmOp::F32Trunc { sd, sm } => {
831                return self.encode_arm_f32_rounding(sd, sm, 0b11); // VCVT toward zero
832            }
833            ArmOp::F32Nearest { sd, sm } => {
834                return self.encode_arm_f32_rounding(sd, sm, 0b00); // VCVT to nearest
835            }
836            ArmOp::F32Min { sd, sn, sm } => {
837                return self.encode_arm_f32_minmax(sd, sn, sm, true);
838            }
839            ArmOp::F32Max { sd, sn, sm } => {
840                return self.encode_arm_f32_minmax(sd, sn, sm, false);
841            }
842            ArmOp::F32Copysign { sd, sn, sm } => {
843                return self.encode_arm_f32_copysign(sd, sn, sm);
844            }
845
846            // f32 comparisons — multi-instruction: VCMP + VMRS + conditional MOV
847            ArmOp::F32Eq { rd, sn, sm } => {
848                return self.encode_arm_f32_compare(rd, sn, sm, 0x0); // EQ
849            }
850            ArmOp::F32Ne { rd, sn, sm } => {
851                return self.encode_arm_f32_compare(rd, sn, sm, 0x1); // NE
852            }
853            ArmOp::F32Lt { rd, sn, sm } => {
854                return self.encode_arm_f32_compare(rd, sn, sm, 0x4); // MI (less than)
855            }
856            ArmOp::F32Le { rd, sn, sm } => {
857                return self.encode_arm_f32_compare(rd, sn, sm, 0x9); // LS (less or same)
858            }
859            ArmOp::F32Gt { rd, sn, sm } => {
860                return self.encode_arm_f32_compare(rd, sn, sm, 0xC); // GT
861            }
862            ArmOp::F32Ge { rd, sn, sm } => {
863                return self.encode_arm_f32_compare(rd, sn, sm, 0xA); // GE
864            }
865
866            // f32 const — multi-instruction: MOVW + MOVT + VMOV
867            ArmOp::F32Const { sd, value } => {
868                return self.encode_arm_f32_const(sd, *value);
869            }
870
871            ArmOp::F32Load { sd, addr } => encode_vfp_ldst(0xED900A00, sd, addr)?,
872            ArmOp::F32Store { sd, addr } => encode_vfp_ldst(0xED800A00, sd, addr)?,
873
874            // f32 conversions — multi-instruction sequences
875            ArmOp::F32ConvertI32S { sd, rm } => {
876                return self.encode_arm_f32_convert_i32(sd, rm, true);
877            }
878            ArmOp::F32ConvertI32U { sd, rm } => {
879                return self.encode_arm_f32_convert_i32(sd, rm, false);
880            }
881            ArmOp::F32ConvertI64S { .. } | ArmOp::F32ConvertI64U { .. } => {
882                return Err(synth_core::Error::synthesis(
883                    "F32 i64 conversion not supported (requires register pairs on 32-bit ARM)",
884                ));
885            }
886            ArmOp::F32ReinterpretI32 { sd, rm } => encode_vmov_core_sreg(true, sd, rm)?,
887            ArmOp::I32ReinterpretF32 { rd, sm } => encode_vmov_core_sreg(false, sm, rd)?,
888            ArmOp::I32TruncF32S { rd, sm } => {
889                return self.encode_arm_i32_trunc_f32(rd, sm, true);
890            }
891            ArmOp::I32TruncF32U { rd, sm } => {
892                return self.encode_arm_i32_trunc_f32(rd, sm, false);
893            }
894
895            // f64 VFP double-precision instructions (ARM32)
896            // F64 arithmetic: same as F32 but with sz=1 (bit 8 = 1, cp11 = 0xB)
897            ArmOp::F64Add { dd, dn, dm } => encode_vfp_3reg_f64(0xEE300B00, dd, dn, dm)?,
898            ArmOp::F64Sub { dd, dn, dm } => encode_vfp_3reg_f64(0xEE300B40, dd, dn, dm)?,
899            ArmOp::F64Mul { dd, dn, dm } => encode_vfp_3reg_f64(0xEE200B00, dd, dn, dm)?,
900            ArmOp::F64Div { dd, dn, dm } => encode_vfp_3reg_f64(0xEE800B00, dd, dn, dm)?,
901            ArmOp::F64Abs { dd, dm } => encode_vfp_2reg_f64(0xEEB00BC0, dd, dm)?,
902            ArmOp::F64Neg { dd, dm } => encode_vfp_2reg_f64(0xEEB10B40, dd, dm)?,
903            ArmOp::F64Sqrt { dd, dm } => encode_vfp_2reg_f64(0xEEB10BC0, dd, dm)?,
904
905            // f64 pseudo-ops
906            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
907            ArmOp::F64Ceil { dd, dm } => {
908                return self.encode_arm_f64_rounding(dd, dm, 0b01);
909            }
910            ArmOp::F64Floor { dd, dm } => {
911                return self.encode_arm_f64_rounding(dd, dm, 0b10);
912            }
913            ArmOp::F64Trunc { dd, dm } => {
914                return self.encode_arm_f64_rounding(dd, dm, 0b11);
915            }
916            ArmOp::F64Nearest { dd, dm } => {
917                return self.encode_arm_f64_rounding(dd, dm, 0b00);
918            }
919            ArmOp::F64Min { dd, dn, dm } => {
920                return self.encode_arm_f64_minmax(dd, dn, dm, true);
921            }
922            ArmOp::F64Max { dd, dn, dm } => {
923                return self.encode_arm_f64_minmax(dd, dn, dm, false);
924            }
925            ArmOp::F64Copysign { dd, dn, dm } => {
926                return self.encode_arm_f64_copysign(dd, dn, dm);
927            }
928
929            // f64 comparisons
930            ArmOp::F64Eq { rd, dn, dm } => {
931                return self.encode_arm_f64_compare(rd, dn, dm, 0x0);
932            }
933            ArmOp::F64Ne { rd, dn, dm } => {
934                return self.encode_arm_f64_compare(rd, dn, dm, 0x1);
935            }
936            ArmOp::F64Lt { rd, dn, dm } => {
937                return self.encode_arm_f64_compare(rd, dn, dm, 0x4);
938            }
939            ArmOp::F64Le { rd, dn, dm } => {
940                return self.encode_arm_f64_compare(rd, dn, dm, 0x9);
941            }
942            ArmOp::F64Gt { rd, dn, dm } => {
943                return self.encode_arm_f64_compare(rd, dn, dm, 0xC);
944            }
945            ArmOp::F64Ge { rd, dn, dm } => {
946                return self.encode_arm_f64_compare(rd, dn, dm, 0xA);
947            }
948
949            ArmOp::F64Const { dd, value } => {
950                return self.encode_arm_f64_const(dd, *value);
951            }
952
953            ArmOp::F64Load { dd, addr } => encode_vfp_ldst_f64(0xED900B00, dd, addr)?,
954            ArmOp::F64Store { dd, addr } => encode_vfp_ldst_f64(0xED800B00, dd, addr)?,
955
956            ArmOp::F64ConvertI32S { dd, rm } => {
957                return self.encode_arm_f64_convert_i32(dd, rm, true);
958            }
959            ArmOp::F64ConvertI32U { dd, rm } => {
960                return self.encode_arm_f64_convert_i32(dd, rm, false);
961            }
962            ArmOp::F64ConvertI64S { .. } | ArmOp::F64ConvertI64U { .. } => {
963                return Err(synth_core::Error::synthesis(
964                    "F64 i64 conversion not supported (requires register pairs on 32-bit ARM)",
965                ));
966            }
967            ArmOp::F64PromoteF32 { dd, sm } => {
968                return self.encode_arm_f64_promote_f32(dd, sm);
969            }
970            ArmOp::F64ReinterpretI64 { dd, rmlo, rmhi } => {
971                encode_vmov_core_dreg(true, dd, rmlo, rmhi)?
972            }
973            ArmOp::I64ReinterpretF64 { rdlo, rdhi, dm } => {
974                encode_vmov_core_dreg(false, dm, rdlo, rdhi)?
975            }
976            ArmOp::I64TruncF64S { .. } | ArmOp::I64TruncF64U { .. } => {
977                return Err(synth_core::Error::synthesis(
978                    "i64 truncation from F64 not supported (requires i64 register pairs on 32-bit ARM)",
979                ));
980            }
981            ArmOp::I32TruncF64S { rd, dm } => {
982                return self.encode_arm_i32_trunc_f64(rd, dm, true);
983            }
984            ArmOp::I32TruncF64U { rd, dm } => {
985                return self.encode_arm_i32_trunc_f64(rd, dm, false);
986            }
987            // Multi-instruction sequences - only meaningful in Thumb-2 mode
988            ArmOp::I64SetCond { .. }
989            | ArmOp::I64SetCondZ { .. }
990            | ArmOp::I64Mul { .. }
991            | ArmOp::I64Shl { .. }
992            | ArmOp::I64ShrS { .. }
993            | ArmOp::I64ShrU { .. }
994            | ArmOp::I64Rotl { .. }
995            | ArmOp::I64Rotr { .. } => 0xE1A00000, // NOP (Thumb-2 only)
996
997            // MVE instructions — Thumb-2 only (Cortex-M55 is always Thumb-2)
998            ArmOp::MveLoad { .. }
999            | ArmOp::MveStore { .. }
1000            | ArmOp::MveConst { .. }
1001            | ArmOp::MveAnd { .. }
1002            | ArmOp::MveOrr { .. }
1003            | ArmOp::MveEor { .. }
1004            | ArmOp::MveMvn { .. }
1005            | ArmOp::MveBic { .. }
1006            | ArmOp::MveAddI { .. }
1007            | ArmOp::MveSubI { .. }
1008            | ArmOp::MveMulI { .. }
1009            | ArmOp::MveNegI { .. }
1010            | ArmOp::MveCmpEqI { .. }
1011            | ArmOp::MveCmpNeI { .. }
1012            | ArmOp::MveCmpLtS { .. }
1013            | ArmOp::MveCmpLtU { .. }
1014            | ArmOp::MveCmpGtS { .. }
1015            | ArmOp::MveCmpGtU { .. }
1016            | ArmOp::MveCmpLeS { .. }
1017            | ArmOp::MveCmpLeU { .. }
1018            | ArmOp::MveCmpGeS { .. }
1019            | ArmOp::MveCmpGeU { .. }
1020            | ArmOp::MveDup { .. }
1021            | ArmOp::MveExtractLane { .. }
1022            | ArmOp::MveInsertLane { .. }
1023            | ArmOp::MveAddF32 { .. }
1024            | ArmOp::MveSubF32 { .. }
1025            | ArmOp::MveMulF32 { .. }
1026            | ArmOp::MveNegF32 { .. }
1027            | ArmOp::MveAbsF32 { .. }
1028            | ArmOp::MveCmpEqF32 { .. }
1029            | ArmOp::MveCmpNeF32 { .. }
1030            | ArmOp::MveCmpLtF32 { .. }
1031            | ArmOp::MveCmpLeF32 { .. }
1032            | ArmOp::MveCmpGtF32 { .. }
1033            | ArmOp::MveCmpGeF32 { .. }
1034            | ArmOp::MveDupF32 { .. }
1035            | ArmOp::MveExtractLaneF32 { .. }
1036            | ArmOp::MveReplaceLaneF32 { .. }
1037            | ArmOp::MveDivF32 { .. }
1038            | ArmOp::MveSqrtF32 { .. } => 0xE1A00000, // NOP (MVE = Thumb-2 only)
1039        };
1040
1041        // ARM32 instructions are little-endian
1042        Ok(instr.to_le_bytes().to_vec())
1043    }
1044
1045    // === ARM32 VFP multi-instruction helpers ===
1046
1047    /// Encode F32 comparison as ARM32: VCMP.F32 + VMRS + MOV rd,#0 + MOVcond rd,#1
1048    fn encode_arm_f32_compare(
1049        &self,
1050        rd: &Reg,
1051        sn: &VfpReg,
1052        sm: &VfpReg,
1053        cond_code: u32,
1054    ) -> Result<Vec<u8>> {
1055        let mut bytes = Vec::new();
1056
1057        // VCMP.F32 Sn, Sm: 0xEEB40A40 with Sn in Vd position, Sm in Vm position
1058        let sn_num = vfp_sreg_to_num(sn)?;
1059        let sm_num = vfp_sreg_to_num(sm)?;
1060        let (vd, d) = encode_sreg(sn_num);
1061        let (vm, m) = encode_sreg(sm_num);
1062        let vcmp = 0xEEB40A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1063        bytes.extend_from_slice(&vcmp.to_le_bytes());
1064
1065        // VMRS APSR_nzcv, FPSCR: 0xEEF1FA10
1066        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1067
1068        // MOV rd, #0: 0xE3A0_0000 | (rd << 12)
1069        let rd_bits = reg_to_bits(rd);
1070        let mov_zero = 0xE3A00000 | (rd_bits << 12);
1071        bytes.extend_from_slice(&mov_zero.to_le_bytes());
1072
1073        // MOVcond rd, #1: cond(4) | 0011 1010 0000 rd(4) 0000 0000 0001
1074        let mov_one = (cond_code << 28) | 0x03A00001 | (rd_bits << 12);
1075        bytes.extend_from_slice(&mov_one.to_le_bytes());
1076
1077        Ok(bytes)
1078    }
1079
1080    /// Encode F32 constant load as ARM32: MOVW Rt,#lo16 + MOVT Rt,#hi16 + VMOV Sd,Rt
1081    fn encode_arm_f32_const(&self, sd: &VfpReg, value: f32) -> Result<Vec<u8>> {
1082        let mut bytes = Vec::new();
1083        let bits = value.to_bits();
1084
1085        // Use R12 as temp register for constant loading
1086        let rt: u32 = 12; // R12/IP
1087
1088        // MOVW R12, #lo16: 0xE300_C000 | (imm4 << 16) | imm12
1089        let lo16 = bits & 0xFFFF;
1090        let movw = 0xE3000000 | (rt << 12) | ((lo16 >> 12) << 16) | (lo16 & 0xFFF);
1091        bytes.extend_from_slice(&movw.to_le_bytes());
1092
1093        // MOVT R12, #hi16: 0xE340_C000 | (imm4 << 16) | imm12
1094        let hi16 = (bits >> 16) & 0xFFFF;
1095        let movt = 0xE3400000 | (rt << 12) | ((hi16 >> 12) << 16) | (hi16 & 0xFFF);
1096        bytes.extend_from_slice(&movt.to_le_bytes());
1097
1098        // VMOV Sd, R12
1099        let vmov = encode_vmov_core_sreg(true, sd, &Reg::R12)?;
1100        bytes.extend_from_slice(&vmov.to_le_bytes());
1101
1102        Ok(bytes)
1103    }
1104
1105    /// Encode VMOV + VCVT.F32.S32/U32 as ARM32
1106    fn encode_arm_f32_convert_i32(&self, sd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
1107        let mut bytes = Vec::new();
1108
1109        // VMOV Sd, Rm — move integer to VFP register
1110        let vmov = encode_vmov_core_sreg(true, sd, rm)?;
1111        bytes.extend_from_slice(&vmov.to_le_bytes());
1112
1113        // VCVT.F32.S32 Sd, Sd (signed) or VCVT.F32.U32 Sd, Sd (unsigned)
1114        // Base: 0xEEB80A40 (signed) or 0xEEB80AC0 (unsigned)
1115        let sd_num = vfp_sreg_to_num(sd)?;
1116        let (vd, d) = encode_sreg(sd_num);
1117        let (vm, m) = encode_sreg(sd_num); // same register as source
1118        let base = if signed { 0xEEB80A40 } else { 0xEEB80AC0 };
1119        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
1120        bytes.extend_from_slice(&vcvt.to_le_bytes());
1121
1122        Ok(bytes)
1123    }
1124
1125    /// Encode F32 rounding pseudo-op as ARM32 via VCVT to integer and back.
1126    /// mode: 0b00=nearest, 0b01=floor(-Inf), 0b10=ceil(+Inf), 0b11=trunc(zero)
1127    /// Strategy: VCVT.S32.F32 Sd, Sm (toward zero), then VCVT.F32.S32 Sd, Sd
1128    /// For ceil/floor/nearest, we use VCVTR (round toward mode) + convert back.
1129    /// Simplified: convert to int (toward zero for trunc) then back to float.
1130    /// Encode F32 rounding as ARM32.
1131    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
1132    ///
1133    /// For trunc (mode=0b11): uses VCVTR.S32.F32 (always rounds toward zero).
1134    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F32 (non-R variant
1135    /// which honours FPSCR rmode), then restores FPSCR.
1136    fn encode_arm_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
1137        let mut bytes = Vec::new();
1138        let sm_num = vfp_sreg_to_num(sm)?;
1139        let sd_num = vfp_sreg_to_num(sd)?;
1140        let (vd_s, d_s) = encode_sreg(sd_num);
1141        let (vm_s, m_s) = encode_sreg(sm_num);
1142
1143        if mode == 0b11 {
1144            // Trunc (toward zero): VCVTR.S32.F32 — the "R" variant always truncates.
1145            // 0xEEBD0AC0: bit[7]=1 => round toward zero regardless of FPSCR
1146            let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
1147            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1148        } else {
1149            // ceil/floor/nearest: manipulate FPSCR rounding mode
1150            let rt: u32 = 12; // R12/IP as temp
1151
1152            // VMRS R12, FPSCR
1153            let vmrs = 0xEEF10A10 | (rt << 12);
1154            bytes.extend_from_slice(&vmrs.to_le_bytes());
1155
1156            // BIC R12, R12, #(3 << 22) — clear RMode bits [23:22]
1157            // 3<<22 = 0x00C00000. ARM rotated imm: 0x03 ror 10 (rotation=5, imm8=0x03)
1158            let bic = 0xE3CC0000 | (rt << 12) | (0x05 << 8) | 0x03;
1159            bytes.extend_from_slice(&bic.to_le_bytes());
1160
1161            // ORR R12, R12, #(mode << 22) — set desired rounding mode
1162            if mode != 0 {
1163                // mode<<22: rotation=5, imm8=mode
1164                let orr = 0xE38C0000 | (rt << 12) | (0x05 << 8) | (mode as u32);
1165                bytes.extend_from_slice(&orr.to_le_bytes());
1166            }
1167
1168            // VMSR FPSCR, R12
1169            let vmsr = 0xEEE10A10 | (rt << 12);
1170            bytes.extend_from_slice(&vmsr.to_le_bytes());
1171
1172            // VCVT.S32.F32 Sd, Sm — non-R variant (bit[7]=0), uses FPSCR rounding mode
1173            let vcvt_to_int = 0xEEBD0A40 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
1174            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1175
1176            // Restore FPSCR: clear rmode bits back to nearest (default)
1177            bytes.extend_from_slice(&vmrs.to_le_bytes());
1178            bytes.extend_from_slice(&bic.to_le_bytes());
1179            bytes.extend_from_slice(&vmsr.to_le_bytes());
1180        }
1181
1182        // VCVT.F32.S32 Sd, Sd (convert integer result back to float)
1183        let (vd2, d2) = encode_sreg(sd_num);
1184        let vcvt_to_float = 0xEEB80A40 | (d2 << 22) | (vd2 << 12) | (d_s << 5) | vd_s;
1185        bytes.extend_from_slice(&vcvt_to_float.to_le_bytes());
1186
1187        Ok(bytes)
1188    }
1189
1190    /// Encode F32 min/max as ARM32: VCMP + VMRS + conditional VMOV
1191    fn encode_arm_f32_minmax(
1192        &self,
1193        sd: &VfpReg,
1194        sn: &VfpReg,
1195        sm: &VfpReg,
1196        is_min: bool,
1197    ) -> Result<Vec<u8>> {
1198        let mut bytes = Vec::new();
1199        let sn_num = vfp_sreg_to_num(sn)?;
1200        let sm_num = vfp_sreg_to_num(sm)?;
1201        let sd_num = vfp_sreg_to_num(sd)?;
1202
1203        // VMOV Sd, Sn (start with first operand)
1204        let (vd, d) = encode_sreg(sd_num);
1205        let (vn, n) = encode_sreg(sn_num);
1206        let vmov_sn = 0xEEB00A40 | (d << 22) | (vd << 12) | (n << 5) | vn;
1207        bytes.extend_from_slice(&vmov_sn.to_le_bytes());
1208
1209        // VCMP.F32 Sn, Sm
1210        let (vm, m) = encode_sreg(sm_num);
1211        let vcmp = 0xEEB40A40 | (n << 22) | (vn << 12) | (m << 5) | vm;
1212        bytes.extend_from_slice(&vcmp.to_le_bytes());
1213
1214        // VMRS APSR_nzcv, FPSCR
1215        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1216
1217        // For min: if Sn > Sm (GT), use Sm. Condition = GT (0xC)
1218        // For max: if Sn < Sm (MI/LT), use Sm. Condition = MI (0x4)
1219        let cond = if is_min { 0xCu32 } else { 0x4u32 };
1220
1221        // VMOV{cond} Sd, Sm — conditional VMOV
1222        let vmov_cond = (cond << 28) | 0x0EB00A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1223        bytes.extend_from_slice(&vmov_cond.to_le_bytes());
1224
1225        Ok(bytes)
1226    }
1227
1228    /// Encode F32 copysign as ARM32: extract sign from Sm, magnitude from Sn
1229    fn encode_arm_f32_copysign(&self, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
1230        let mut bytes = Vec::new();
1231
1232        // VMOV R12, Sm (get sign source bits)
1233        let vmov_sm = encode_vmov_core_sreg(false, sm, &Reg::R12)?;
1234        bytes.extend_from_slice(&vmov_sm.to_le_bytes());
1235
1236        // VMOV R0, Sn (get magnitude source bits) — use R0 as temp
1237        let vmov_sn = encode_vmov_core_sreg(false, sn, &Reg::R0)?;
1238        bytes.extend_from_slice(&vmov_sn.to_le_bytes());
1239
1240        // AND R12, R12, #0x80000000 (keep only sign bit)
1241        // Thumb-2 constant 0x80000000 needs special encoding; in ARM32 use rotated imm
1242        // 0x80000000 = 0x02 rotated right by 2 (rotation=1, imm8=0x02)
1243        let and_sign = 0xE2000000u32 | (12 << 16) | (12 << 12) | (1 << 8) | 0x02;
1244        bytes.extend_from_slice(&and_sign.to_le_bytes());
1245
1246        // BIC R0, R0, #0x80000000 (clear sign bit from magnitude)
1247        // R0 = register 0, so Rn and Rd fields are 0
1248        let bic_sign = 0xE3C00000u32 | (1 << 8) | 0x02;
1249        bytes.extend_from_slice(&bic_sign.to_le_bytes());
1250
1251        // ORR R0, R0, R12 (combine sign + magnitude)
1252        // R0 = register 0, so Rn and Rd fields are 0
1253        let orr = 0xE1800000u32 | 12;
1254        bytes.extend_from_slice(&orr.to_le_bytes());
1255
1256        // VMOV Sd, R0
1257        let vmov_result = encode_vmov_core_sreg(true, sd, &Reg::R0)?;
1258        bytes.extend_from_slice(&vmov_result.to_le_bytes());
1259
1260        Ok(bytes)
1261    }
1262
1263    /// Encode F64 comparison as ARM32: VCMP.F64 + VMRS + MOV rd,#0 + MOVcond rd,#1
1264    fn encode_arm_f64_compare(
1265        &self,
1266        rd: &Reg,
1267        dn: &VfpReg,
1268        dm: &VfpReg,
1269        cond_code: u32,
1270    ) -> Result<Vec<u8>> {
1271        let mut bytes = Vec::new();
1272
1273        // VCMP.F64 Dn, Dm: 0xEEB40B40 with Dn in Vd position, Dm in Vm position
1274        let dn_num = vfp_dreg_to_num(dn)?;
1275        let dm_num = vfp_dreg_to_num(dm)?;
1276        let (vd, d) = encode_dreg(dn_num);
1277        let (vm, m) = encode_dreg(dm_num);
1278        let vcmp = 0xEEB40B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1279        bytes.extend_from_slice(&vcmp.to_le_bytes());
1280
1281        // VMRS APSR_nzcv, FPSCR
1282        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1283
1284        // MOV rd, #0
1285        let rd_bits = reg_to_bits(rd);
1286        let mov_zero = 0xE3A00000 | (rd_bits << 12);
1287        bytes.extend_from_slice(&mov_zero.to_le_bytes());
1288
1289        // MOVcond rd, #1
1290        let mov_one = (cond_code << 28) | 0x03A00001 | (rd_bits << 12);
1291        bytes.extend_from_slice(&mov_one.to_le_bytes());
1292
1293        Ok(bytes)
1294    }
1295
1296    /// Encode F64 constant load as ARM32: MOVW + MOVT + MOVW + MOVT + VMOV
1297    fn encode_arm_f64_const(&self, dd: &VfpReg, value: f64) -> Result<Vec<u8>> {
1298        let mut bytes = Vec::new();
1299        let bits = value.to_bits();
1300        let lo32 = bits as u32;
1301        let hi32 = (bits >> 32) as u32;
1302
1303        // Load low 32 bits into R0 (Rd field = 0 for R0)
1304        let lo16 = lo32 & 0xFFFF;
1305        let movw_r0 = 0xE3000000 | ((lo16 >> 12) << 16) | (lo16 & 0xFFF);
1306        bytes.extend_from_slice(&movw_r0.to_le_bytes());
1307        let hi16 = (lo32 >> 16) & 0xFFFF;
1308        let movt_r0 = 0xE3400000 | ((hi16 >> 12) << 16) | (hi16 & 0xFFF);
1309        bytes.extend_from_slice(&movt_r0.to_le_bytes());
1310
1311        // Load high 32 bits into R12
1312        let lo16 = hi32 & 0xFFFF;
1313        let movw_r12 = 0xE3000000 | ((lo16 >> 12) << 16) | (12 << 12) | (lo16 & 0xFFF);
1314        bytes.extend_from_slice(&movw_r12.to_le_bytes());
1315        let hi16 = (hi32 >> 16) & 0xFFFF;
1316        let movt_r12 = 0xE3400000 | ((hi16 >> 12) << 16) | (12 << 12) | (hi16 & 0xFFF);
1317        bytes.extend_from_slice(&movt_r12.to_le_bytes());
1318
1319        // VMOV Dd, R0, R12
1320        let vmov = encode_vmov_core_dreg(true, dd, &Reg::R0, &Reg::R12)?;
1321        bytes.extend_from_slice(&vmov.to_le_bytes());
1322
1323        Ok(bytes)
1324    }
1325
1326    /// Encode VMOV Sd, Rm + VCVT.F64.S32/U32 Dd, Sd as ARM32
1327    fn encode_arm_f64_convert_i32(&self, dd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
1328        let mut bytes = Vec::new();
1329
1330        // Use S0 as intermediate: VMOV S0, Rm
1331        let vmov = encode_vmov_core_sreg(true, &VfpReg::S0, rm)?;
1332        bytes.extend_from_slice(&vmov.to_le_bytes());
1333
1334        // VCVT.F64.S32 Dd, S0 (signed) or VCVT.F64.U32 Dd, S0 (unsigned)
1335        // Base: 0xEEB80B40 (signed) or 0xEEB80BC0 (unsigned)
1336        let dd_num = vfp_dreg_to_num(dd)?;
1337        let (vd, d) = encode_dreg(dd_num);
1338        let base = if signed { 0xEEB80B40 } else { 0xEEB80BC0 };
1339        // S0 is register 0: Vm=0, M=0
1340        let vcvt = base | (d << 22) | (vd << 12);
1341        bytes.extend_from_slice(&vcvt.to_le_bytes());
1342
1343        Ok(bytes)
1344    }
1345
1346    /// Encode VCVT.F64.F32 Dd, Sm as ARM32 (f32 to f64 promotion)
1347    fn encode_arm_f64_promote_f32(&self, dd: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
1348        let dd_num = vfp_dreg_to_num(dd)?;
1349        let sm_num = vfp_sreg_to_num(sm)?;
1350        let (vd, d) = encode_dreg(dd_num);
1351        let (vm, m) = encode_sreg(sm_num);
1352
1353        // VCVT.F64.F32 Dd, Sm: 0xEEB70AC0
1354        let vcvt = 0xEEB70AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
1355        Ok(vcvt.to_le_bytes().to_vec())
1356    }
1357
1358    /// Encode VCVT.S32/U32.F64 Sd, Dm + VMOV Rd, Sd as ARM32
1359    fn encode_arm_i32_trunc_f64(&self, rd: &Reg, dm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
1360        let mut bytes = Vec::new();
1361        let dm_num = vfp_dreg_to_num(dm)?;
1362        let (vm, m) = encode_dreg(dm_num);
1363
1364        // VCVT.S32.F64 S0, Dm (toward zero) or VCVT.U32.F64 S0, Dm
1365        // S0: Vd=0, D=0
1366        let base = if signed { 0xEEBD0BC0 } else { 0xEEBC0BC0 };
1367        let vcvt = base | (m << 5) | vm;
1368        bytes.extend_from_slice(&vcvt.to_le_bytes());
1369
1370        // VMOV Rd, S0
1371        let vmov = encode_vmov_core_sreg(false, &VfpReg::S0, rd)?;
1372        bytes.extend_from_slice(&vmov.to_le_bytes());
1373
1374        Ok(bytes)
1375    }
1376
1377    /// Encode F64 rounding pseudo-op as ARM32 via VCVT to integer and back.
1378    /// Encode F64 rounding as ARM32.
1379    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
1380    ///
1381    /// For trunc: uses VCVTR.S32.F64 (always truncates).
1382    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F64 (non-R variant),
1383    /// then restores FPSCR.
1384    fn encode_arm_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
1385        let mut bytes = Vec::new();
1386        let dm_num = vfp_dreg_to_num(dm)?;
1387        let dd_num = vfp_dreg_to_num(dd)?;
1388        let (vm, m) = encode_dreg(dm_num);
1389        let (vd, d) = encode_dreg(dd_num);
1390
1391        if mode == 0b11 {
1392            // Trunc (toward zero): VCVTR.S32.F64 — bit[7]=1, always truncates
1393            let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
1394            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1395        } else {
1396            // ceil/floor/nearest: manipulate FPSCR rounding mode
1397            let rt: u32 = 12;
1398
1399            // VMRS R12, FPSCR
1400            let vmrs = 0xEEF10A10 | (rt << 12);
1401            bytes.extend_from_slice(&vmrs.to_le_bytes());
1402
1403            // BIC R12, R12, #(3 << 22)
1404            let bic = 0xE3CC0000 | (rt << 12) | (0x05 << 8) | 0x03;
1405            bytes.extend_from_slice(&bic.to_le_bytes());
1406
1407            // ORR R12, R12, #(mode << 22)
1408            if mode != 0 {
1409                let orr = 0xE38C0000 | (rt << 12) | (0x05 << 8) | (mode as u32);
1410                bytes.extend_from_slice(&orr.to_le_bytes());
1411            }
1412
1413            // VMSR FPSCR, R12
1414            let vmsr = 0xEEE10A10 | (rt << 12);
1415            bytes.extend_from_slice(&vmsr.to_le_bytes());
1416
1417            // VCVT.S32.F64 S0, Dm — non-R variant (bit[7]=0), uses FPSCR rmode
1418            let vcvt_to_int = 0xEEBD0B40 | (m << 5) | vm;
1419            bytes.extend_from_slice(&vcvt_to_int.to_le_bytes());
1420
1421            // Restore FPSCR
1422            bytes.extend_from_slice(&vmrs.to_le_bytes());
1423            bytes.extend_from_slice(&bic.to_le_bytes());
1424            bytes.extend_from_slice(&vmsr.to_le_bytes());
1425        }
1426
1427        // VCVT.F64.S32 Dd, S0 (convert back to double)
1428        let vcvt_to_float = 0xEEB80B40 | (d << 22) | (vd << 12);
1429        bytes.extend_from_slice(&vcvt_to_float.to_le_bytes());
1430
1431        Ok(bytes)
1432    }
1433
1434    /// Encode F64 min/max as ARM32: VMOV + VCMP + VMRS + conditional VMOV
1435    fn encode_arm_f64_minmax(
1436        &self,
1437        dd: &VfpReg,
1438        dn: &VfpReg,
1439        dm: &VfpReg,
1440        is_min: bool,
1441    ) -> Result<Vec<u8>> {
1442        let mut bytes = Vec::new();
1443        let dn_num = vfp_dreg_to_num(dn)?;
1444        let dm_num = vfp_dreg_to_num(dm)?;
1445        let dd_num = vfp_dreg_to_num(dd)?;
1446
1447        // VMOV.F64 Dd, Dn (start with first operand)
1448        let (vd, d) = encode_dreg(dd_num);
1449        let (vn, n) = encode_dreg(dn_num);
1450        let vmov_dn = 0xEEB00B40 | (d << 22) | (vd << 12) | (n << 5) | vn;
1451        bytes.extend_from_slice(&vmov_dn.to_le_bytes());
1452
1453        // VCMP.F64 Dn, Dm
1454        let (vm, m) = encode_dreg(dm_num);
1455        let vcmp = 0xEEB40B40 | (n << 22) | (vn << 12) | (m << 5) | vm;
1456        bytes.extend_from_slice(&vcmp.to_le_bytes());
1457
1458        // VMRS APSR_nzcv, FPSCR
1459        bytes.extend_from_slice(&0xEEF1FA10u32.to_le_bytes());
1460
1461        let cond = if is_min { 0xCu32 } else { 0x4u32 };
1462        let vmov_cond = (cond << 28) | 0x0EB00B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
1463        bytes.extend_from_slice(&vmov_cond.to_le_bytes());
1464
1465        Ok(bytes)
1466    }
1467
1468    /// Encode F64 copysign as ARM32
1469    fn encode_arm_f64_copysign(&self, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<Vec<u8>> {
1470        let mut bytes = Vec::new();
1471
1472        // VMOV R0, R12, Dm (get sign source bits)
1473        let vmov_dm = encode_vmov_core_dreg(false, dm, &Reg::R0, &Reg::R12)?;
1474        bytes.extend_from_slice(&vmov_dm.to_le_bytes());
1475
1476        // VMOV R1, R2, Dn (get magnitude source bits)
1477        // We use R1 (lo) and R2 (hi) for the magnitude
1478        let vmov_dn = encode_vmov_core_dreg(false, dn, &Reg::R1, &Reg::R2)?;
1479        bytes.extend_from_slice(&vmov_dn.to_le_bytes());
1480
1481        // AND R12, R12, #0x80000000 (keep only sign bit from hi word)
1482        let and_sign = 0xE2000000u32 | (12 << 16) | (12 << 12) | (1 << 8) | 0x02;
1483        bytes.extend_from_slice(&and_sign.to_le_bytes());
1484
1485        // BIC R2, R2, #0x80000000 (clear sign bit from magnitude hi word)
1486        let bic_sign = 0xE3C00000u32 | (2 << 16) | (2 << 12) | (1 << 8) | 0x02;
1487        bytes.extend_from_slice(&bic_sign.to_le_bytes());
1488
1489        // ORR R2, R2, R12 (combine sign + magnitude)
1490        let orr = 0xE1800000u32 | (2 << 16) | (2 << 12) | 12;
1491        bytes.extend_from_slice(&orr.to_le_bytes());
1492
1493        // VMOV Dd, R1, R2
1494        let vmov_result = encode_vmov_core_dreg(true, dd, &Reg::R1, &Reg::R2)?;
1495        bytes.extend_from_slice(&vmov_result.to_le_bytes());
1496
1497        Ok(bytes)
1498    }
1499
1500    /// Encode VCVT.S32/U32.F32 + VMOV as ARM32
1501    fn encode_arm_i32_trunc_f32(&self, rd: &Reg, sm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
1502        let mut bytes = Vec::new();
1503
1504        // VCVT.S32.F32 Sd, Sm (toward zero) or VCVT.U32.F32 Sd, Sm
1505        // We use Sm as both source and destination for the intermediate result
1506        let sm_num = vfp_sreg_to_num(sm)?;
1507        let (vd, d) = encode_sreg(sm_num);
1508        let (vm, m) = encode_sreg(sm_num);
1509        let base = if signed { 0xEEBD0AC0 } else { 0xEEBC0AC0 };
1510        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
1511        bytes.extend_from_slice(&vcvt.to_le_bytes());
1512
1513        // VMOV Rd, Sm — move result back to core register
1514        let vmov = encode_vmov_core_sreg(false, sm, rd)?;
1515        bytes.extend_from_slice(&vmov.to_le_bytes());
1516
1517        Ok(bytes)
1518    }
1519
1520    /// Encode an ARM instruction in Thumb-2 mode (16-bit or 32-bit instructions)
1521    fn encode_thumb(&self, op: &ArmOp) -> Result<Vec<u8>> {
1522        // Thumb-2 supports both 16-bit and 32-bit instructions
1523        // 32-bit instructions are encoded as two 16-bit halfwords (big-endian order)
1524        match op {
1525            // === 16-bit Thumb encodings ===
1526            ArmOp::Add { rd, rn, op2 } => {
1527                let rd_bits = reg_to_bits(rd) as u16;
1528                let rn_bits = reg_to_bits(rn) as u16;
1529
1530                if let Operand2::Reg(rm) = op2 {
1531                    let rm_bits = reg_to_bits(rm) as u16;
1532                    // 16-bit ADDS only has 3-bit register fields (R0-R7). For
1533                    // high registers (e.g. R12, the MemLoad/MemStore base
1534                    // scratch) the bits overflow into adjacent fields, silently
1535                    // corrupting the operands — issue #178/#180: `add ip,ip,r0`
1536                    // was emitted as `adds r4,r5,r1`. Guard on all three regs
1537                    // being low and fall back to 32-bit ADD.W otherwise, exactly
1538                    // as the Sub handler below does.
1539                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1540                        // ADDS Rd, Rn, Rm (16-bit): 0001 100 Rm Rn Rd
1541                        let instr: u16 = 0x1800 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1542                        Ok(instr.to_le_bytes().to_vec())
1543                    } else {
1544                        // ADD.W Rd, Rn, Rm (32-bit) for high registers
1545                        self.encode_thumb32_add_reg_raw(
1546                            rd_bits as u32,
1547                            rn_bits as u32,
1548                            rm_bits as u32,
1549                        )
1550                    }
1551                } else if let Operand2::Imm(imm) = op2 {
1552                    if *imm <= 7 && rd_bits < 8 && rn_bits < 8 {
1553                        // ADDS Rd, Rn, #imm3 (16-bit): 0001 110 imm3 Rn Rd
1554                        let instr: u16 = 0x1C00 | ((*imm as u16) << 6) | (rn_bits << 3) | rd_bits;
1555                        Ok(instr.to_le_bytes().to_vec())
1556                    } else {
1557                        // Use 32-bit ADD for larger immediates
1558                        self.encode_thumb32_add(rd, rn, *imm as u32)
1559                    }
1560                } else {
1561                    // Fallback to 32-bit encoding
1562                    self.encode_thumb32_add(rd, rn, 0)
1563                }
1564            }
1565
1566            ArmOp::Sub { rd, rn, op2 } => {
1567                let rd_bits = reg_to_bits(rd) as u16;
1568                let rn_bits = reg_to_bits(rn) as u16;
1569
1570                if let Operand2::Reg(rm) = op2 {
1571                    let rm_bits = reg_to_bits(rm) as u16;
1572                    // 16-bit SUBS can only use low registers (R0-R7)
1573                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1574                        // SUBS Rd, Rn, Rm (16-bit): 0001 101 Rm Rn Rd
1575                        let instr: u16 = 0x1A00 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1576                        Ok(instr.to_le_bytes().to_vec())
1577                    } else {
1578                        // Use 32-bit SUB.W for high registers
1579                        self.encode_thumb32_sub_reg_raw(
1580                            rd_bits as u32,
1581                            rn_bits as u32,
1582                            rm_bits as u32,
1583                        )
1584                    }
1585                } else if let Operand2::Imm(imm) = op2 {
1586                    if *imm <= 7 && rd_bits < 8 && rn_bits < 8 {
1587                        // SUBS Rd, Rn, #imm3 (16-bit): 0001 111 imm3 Rn Rd
1588                        let instr: u16 = 0x1E00 | ((*imm as u16) << 6) | (rn_bits << 3) | rd_bits;
1589                        Ok(instr.to_le_bytes().to_vec())
1590                    } else {
1591                        self.encode_thumb32_sub(rd, rn, *imm as u32)
1592                    }
1593                } else {
1594                    self.encode_thumb32_sub(rd, rn, 0)
1595                }
1596            }
1597
1598            ArmOp::Mov { rd, op2 } => {
1599                let rd_bits = reg_to_bits(rd) as u16;
1600
1601                if let Operand2::Imm(imm) = op2 {
1602                    if *imm <= 255 && rd_bits < 8 {
1603                        // MOVS Rd, #imm8 (16-bit): 0010 0 Rd imm8
1604                        let imm_bits = (*imm as u16) & 0xFF;
1605                        let instr: u16 = 0x2000 | (rd_bits << 8) | imm_bits;
1606                        Ok(instr.to_le_bytes().to_vec())
1607                    } else {
1608                        // Use 32-bit MOVW for larger immediates
1609                        self.encode_thumb32_movw(rd, *imm as u32)
1610                    }
1611                } else if let Operand2::Reg(rm) = op2 {
1612                    let rm_bits = reg_to_bits(rm) as u16;
1613                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
1614                    // D = Rd[3], Rd[2:0] in lower bits
1615                    let d_bit = (rd_bits >> 3) & 1;
1616                    let instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
1617                    Ok(instr.to_le_bytes().to_vec())
1618                } else {
1619                    let instr: u16 = 0xBF00; // NOP fallback
1620                    Ok(instr.to_le_bytes().to_vec())
1621                }
1622            }
1623
1624            ArmOp::Push { regs } => {
1625                // Thumb-2 PUSH encoding:
1626                // If all regs in R0-R7 + LR, use 16-bit: 1011 010 M rrrrrrrr
1627                // Otherwise use 32-bit: STMDB SP!, {regs} = 1110 1001 0010 1101 | 0M0 reglist(13)
1628                let mut reg_list: u16 = 0;
1629                let mut need_32bit = false;
1630                for r in regs {
1631                    let bit = reg_to_bits(r);
1632                    if bit >= 8 && *r != Reg::LR {
1633                        need_32bit = true;
1634                    }
1635                    reg_list |= 1 << bit;
1636                }
1637                if !need_32bit {
1638                    // 16-bit PUSH: 1011 010 M rrrrrrrr
1639                    let m_bit = if reg_list & (1 << 14) != 0 {
1640                        1u16
1641                    } else {
1642                        0u16
1643                    };
1644                    let low_regs = reg_list & 0xFF;
1645                    let instr: u16 = 0xB400 | (m_bit << 8) | low_regs;
1646                    Ok(instr.to_le_bytes().to_vec())
1647                } else {
1648                    // 32-bit STMDB SP!, {regs}: E92D | reglist(16)
1649                    let hw1: u16 = 0xE92D;
1650                    let hw2: u16 = reg_list;
1651                    let mut bytes = hw1.to_le_bytes().to_vec();
1652                    bytes.extend_from_slice(&hw2.to_le_bytes());
1653                    Ok(bytes)
1654                }
1655            }
1656
1657            ArmOp::Pop { regs } => {
1658                // Thumb-2 POP encoding:
1659                // If all regs in R0-R7 + PC, use 16-bit: 1011 110 P rrrrrrrr
1660                // Otherwise use 32-bit: LDMIA SP!, {regs} = 1110 1000 1011 1101 | PM0 reglist(13)
1661                let mut reg_list: u16 = 0;
1662                let mut need_32bit = false;
1663                for r in regs {
1664                    let bit = reg_to_bits(r);
1665                    if bit >= 8 && *r != Reg::PC {
1666                        need_32bit = true;
1667                    }
1668                    reg_list |= 1 << bit;
1669                }
1670                if !need_32bit {
1671                    // 16-bit POP: 1011 110 P rrrrrrrr
1672                    let p_bit = if reg_list & (1 << 15) != 0 {
1673                        1u16
1674                    } else {
1675                        0u16
1676                    };
1677                    let low_regs = reg_list & 0xFF;
1678                    let instr: u16 = 0xBC00 | (p_bit << 8) | low_regs;
1679                    Ok(instr.to_le_bytes().to_vec())
1680                } else {
1681                    // 32-bit LDMIA SP!, {regs}: E8BD | reglist(16)
1682                    let hw1: u16 = 0xE8BD;
1683                    let hw2: u16 = reg_list;
1684                    let mut bytes = hw1.to_le_bytes().to_vec();
1685                    bytes.extend_from_slice(&hw2.to_le_bytes());
1686                    Ok(bytes)
1687                }
1688            }
1689
1690            ArmOp::Nop => {
1691                let instr: u16 = 0xBF00; // NOP in Thumb-2
1692                Ok(instr.to_le_bytes().to_vec())
1693            }
1694
1695            ArmOp::Udf { imm } => {
1696                // UDF (Undefined) in Thumb-2: 16-bit encoding is 0xDE00 | imm8
1697                // This triggers UsageFault/HardFault, used for WASM traps
1698                let instr: u16 = 0xDE00 | (*imm as u16);
1699                let bytes = instr.to_le_bytes().to_vec();
1700                encoding_contracts::verify_thumb16(&bytes);
1701                Ok(bytes)
1702            }
1703
1704            // i64 support: ADDS, ADC, SUBS, SBC for register pair arithmetic
1705            // ADDS sets flags (carry), ADC uses carry from previous ADDS
1706            ArmOp::Adds { rd, rn, op2 } => {
1707                let rd_bits = reg_to_bits(rd) as u16;
1708                let rn_bits = reg_to_bits(rn) as u16;
1709
1710                if let Operand2::Reg(rm) = op2 {
1711                    let rm_bits = reg_to_bits(rm) as u16;
1712                    // 16-bit ADDS is R0-R7 only; i64 pair allocation can place
1713                    // operands in R8-R11, which would overflow the 3-bit fields
1714                    // and corrupt the operands (#178/#180 class). Guard and fall
1715                    // back to 32-bit ADDS.W for high registers.
1716                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1717                        // ADDS Rd, Rn, Rm (16-bit): 0001 100 Rm Rn Rd
1718                        let instr: u16 = 0x1800 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1719                        Ok(instr.to_le_bytes().to_vec())
1720                    } else {
1721                        self.encode_thumb32_adds_reg_raw(
1722                            rd_bits as u32,
1723                            rn_bits as u32,
1724                            rm_bits as u32,
1725                        )
1726                    }
1727                } else {
1728                    // 32-bit Thumb-2 ADDS with immediate
1729                    self.encode_thumb32_adds(rd, rn, 0)
1730                }
1731            }
1732
1733            // ADC: Add with Carry (Thumb-2 32-bit)
1734            // ADC.W Rd, Rn, Rm: EB40 Rn | 00 Rd 00 Rm
1735            ArmOp::Adc { rd, rn, op2 } => {
1736                let rd_bits = reg_to_bits(rd);
1737                let rn_bits = reg_to_bits(rn);
1738
1739                if let Operand2::Reg(rm) = op2 {
1740                    let rm_bits = reg_to_bits(rm);
1741                    // ADC.W Rd, Rn, Rm (T2): 1110 1011 0100 Rn | 0 000 Rd 00 00 Rm
1742                    let hw1: u16 = (0xEB40 | rn_bits) as u16;
1743                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1744
1745                    let mut bytes = hw1.to_le_bytes().to_vec();
1746                    bytes.extend_from_slice(&hw2.to_le_bytes());
1747                    Ok(bytes)
1748                } else {
1749                    // ADC with immediate - use 32-bit encoding
1750                    let hw1: u16 = (0xF140 | rn_bits) as u16;
1751                    let hw2: u16 = (rd_bits << 8) as u16;
1752                    let mut bytes = hw1.to_le_bytes().to_vec();
1753                    bytes.extend_from_slice(&hw2.to_le_bytes());
1754                    Ok(bytes)
1755                }
1756            }
1757
1758            // SUBS sets flags (borrow), SBC uses borrow from previous SUBS
1759            ArmOp::Subs { rd, rn, op2 } => {
1760                let rd_bits = reg_to_bits(rd) as u16;
1761                let rn_bits = reg_to_bits(rn) as u16;
1762
1763                if let Operand2::Reg(rm) = op2 {
1764                    let rm_bits = reg_to_bits(rm) as u16;
1765                    // 16-bit SUBS is R0-R7 only; high-register i64 pair operands
1766                    // would overflow the 3-bit fields (#178/#180 class). Guard
1767                    // and fall back to 32-bit SUBS.W for high registers.
1768                    if rd_bits < 8 && rn_bits < 8 && rm_bits < 8 {
1769                        // SUBS Rd, Rn, Rm (16-bit): 0001 101 Rm Rn Rd
1770                        let instr: u16 = 0x1A00 | (rm_bits << 6) | (rn_bits << 3) | rd_bits;
1771                        Ok(instr.to_le_bytes().to_vec())
1772                    } else {
1773                        self.encode_thumb32_subs_reg_raw(
1774                            rd_bits as u32,
1775                            rn_bits as u32,
1776                            rm_bits as u32,
1777                        )
1778                    }
1779                } else {
1780                    // 32-bit Thumb-2 SUBS with immediate
1781                    self.encode_thumb32_subs(rd, rn, 0)
1782                }
1783            }
1784
1785            // SBC: Subtract with Carry (Thumb-2 32-bit)
1786            // SBC.W Rd, Rn, Rm: EB60 Rn | 00 Rd 00 Rm
1787            ArmOp::Sbc { rd, rn, op2 } => {
1788                let rd_bits = reg_to_bits(rd);
1789                let rn_bits = reg_to_bits(rn);
1790
1791                if let Operand2::Reg(rm) = op2 {
1792                    let rm_bits = reg_to_bits(rm);
1793                    // SBC.W Rd, Rn, Rm (T2): 1110 1011 0110 Rn | 0 000 Rd 00 00 Rm
1794                    let hw1: u16 = (0xEB60 | rn_bits) as u16;
1795                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1796
1797                    let mut bytes = hw1.to_le_bytes().to_vec();
1798                    bytes.extend_from_slice(&hw2.to_le_bytes());
1799                    Ok(bytes)
1800                } else {
1801                    // SBC with immediate - use 32-bit encoding
1802                    let hw1: u16 = (0xF160 | rn_bits) as u16;
1803                    let hw2: u16 = (rd_bits << 8) as u16;
1804                    let mut bytes = hw1.to_le_bytes().to_vec();
1805                    bytes.extend_from_slice(&hw2.to_le_bytes());
1806                    Ok(bytes)
1807                }
1808            }
1809
1810            // === 32-bit Thumb-2 encodings ===
1811
1812            // SDIV: 11111011 1001 Rn 1111 Rd 1111 Rm
1813            ArmOp::Sdiv { rd, rn, rm } => {
1814                let rd_bits = reg_to_bits(rd);
1815                let rn_bits = reg_to_bits(rn);
1816                let rm_bits = reg_to_bits(rm);
1817                reg_bits_checked(rd_bits)?;
1818                reg_bits_checked(rn_bits)?;
1819                reg_bits_checked(rm_bits)?;
1820
1821                // Thumb-2 SDIV: FB90 F0F0 | Rn<<16 | Rd<<8 | Rm
1822                // First halfword: 1111 1011 1001 Rn = 0xFB90 | Rn
1823                // Second halfword: 1111 Rd 1111 Rm = 0xF0F0 | Rd<<8 | Rm
1824                let hw1: u16 = (0xFB90 | rn_bits) as u16;
1825                let hw2: u16 = (0xF0F0 | (rd_bits << 8) | rm_bits) as u16;
1826
1827                // Thumb-2 32-bit instructions: first halfword, then second halfword (little-endian each)
1828                let mut bytes = hw1.to_le_bytes().to_vec();
1829                bytes.extend_from_slice(&hw2.to_le_bytes());
1830                encoding_contracts::verify_thumb32(&bytes);
1831                Ok(bytes)
1832            }
1833
1834            // UDIV: 11111011 1011 Rn 1111 Rd 1111 Rm
1835            ArmOp::Udiv { rd, rn, rm } => {
1836                let rd_bits = reg_to_bits(rd);
1837                let rn_bits = reg_to_bits(rn);
1838                let rm_bits = reg_to_bits(rm);
1839                reg_bits_checked(rd_bits)?;
1840                reg_bits_checked(rn_bits)?;
1841                reg_bits_checked(rm_bits)?;
1842
1843                // Thumb-2 UDIV: FBB0 F0F0 | Rn<<16 | Rd<<8 | Rm
1844                let hw1: u16 = (0xFBB0 | rn_bits) as u16;
1845                let hw2: u16 = (0xF0F0 | (rd_bits << 8) | rm_bits) as u16;
1846
1847                let mut bytes = hw1.to_le_bytes().to_vec();
1848                bytes.extend_from_slice(&hw2.to_le_bytes());
1849                encoding_contracts::verify_thumb32(&bytes);
1850                Ok(bytes)
1851            }
1852
1853            ArmOp::Umull { rdlo, rdhi, rn, rm } => {
1854                let rdlo_bits = reg_to_bits(rdlo);
1855                let rdhi_bits = reg_to_bits(rdhi);
1856                let rn_bits = reg_to_bits(rn);
1857                let rm_bits = reg_to_bits(rm);
1858                reg_bits_checked(rdlo_bits)?;
1859                reg_bits_checked(rdhi_bits)?;
1860                reg_bits_checked(rn_bits)?;
1861                reg_bits_checked(rm_bits)?;
1862
1863                // Thumb-2 UMULL: 1111 1011 1010 Rn | RdLo RdHi 0000 Rm
1864                let hw1: u16 = (0xFBA0 | rn_bits) as u16;
1865                let hw2: u16 = ((rdlo_bits << 12) | (rdhi_bits << 8) | rm_bits) as u16;
1866
1867                let mut bytes = hw1.to_le_bytes().to_vec();
1868                bytes.extend_from_slice(&hw2.to_le_bytes());
1869                encoding_contracts::verify_thumb32(&bytes);
1870                Ok(bytes)
1871            }
1872
1873            // MUL (Thumb-2 32-bit): MUL Rd, Rn, Rm
1874            ArmOp::Mul { rd, rn, rm } => {
1875                let rd_bits = reg_to_bits(rd);
1876                let rn_bits = reg_to_bits(rn);
1877                let rm_bits = reg_to_bits(rm);
1878
1879                // Thumb-2 MUL: FB00 F000 | Rn | Rd<<8 | Rm
1880                // 11111011 0000 Rn | 1111 Rd 0000 Rm
1881                let hw1: u16 = (0xFB00 | rn_bits) as u16;
1882                let hw2: u16 = (0xF000 | (rd_bits << 8) | rm_bits) as u16;
1883
1884                let mut bytes = hw1.to_le_bytes().to_vec();
1885                bytes.extend_from_slice(&hw2.to_le_bytes());
1886                Ok(bytes)
1887            }
1888
1889            // MLS: Rd = Ra - Rn * Rm
1890            ArmOp::Mls { rd, rn, rm, ra } => {
1891                let rd_bits = reg_to_bits(rd);
1892                let rn_bits = reg_to_bits(rn);
1893                let rm_bits = reg_to_bits(rm);
1894                let ra_bits = reg_to_bits(ra);
1895
1896                // Thumb-2 MLS: FB00 Rn | Ra Rd 0001 Rm
1897                // 11111011 0000 Rn | Ra Rd 0001 Rm
1898                let hw1: u16 = (0xFB00 | rn_bits) as u16;
1899                let hw2: u16 = ((ra_bits << 12) | (rd_bits << 8) | 0x10 | rm_bits) as u16;
1900
1901                let mut bytes = hw1.to_le_bytes().to_vec();
1902                bytes.extend_from_slice(&hw2.to_le_bytes());
1903                Ok(bytes)
1904            }
1905
1906            ArmOp::Mla { rd, rn, rm, ra } => {
1907                let rd_bits = reg_to_bits(rd);
1908                let rn_bits = reg_to_bits(rn);
1909                let rm_bits = reg_to_bits(rm);
1910                let ra_bits = reg_to_bits(ra);
1911
1912                // Thumb-2 MLA: FB00 Rn | Ra Rd 0000 Rm — same as MLS without the
1913                // bit-4 (0x10) op flag. rd = ra + rn*rm.
1914                let hw1: u16 = (0xFB00 | rn_bits) as u16;
1915                let hw2: u16 = ((ra_bits << 12) | (rd_bits << 8) | rm_bits) as u16;
1916
1917                let mut bytes = hw1.to_le_bytes().to_vec();
1918                bytes.extend_from_slice(&hw2.to_le_bytes());
1919                Ok(bytes)
1920            }
1921
1922            // AND (Thumb-2 32-bit)
1923            ArmOp::And { rd, rn, op2 } => {
1924                if let Operand2::Reg(rm) = op2 {
1925                    let rd_bits = reg_to_bits(rd);
1926                    let rn_bits = reg_to_bits(rn);
1927                    let rm_bits = reg_to_bits(rm);
1928
1929                    // Thumb-2 AND register: EA00 Rn | 0 Rd 00 00 Rm
1930                    let hw1: u16 = (0xEA00 | rn_bits) as u16;
1931                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1932
1933                    let mut bytes = hw1.to_le_bytes().to_vec();
1934                    bytes.extend_from_slice(&hw2.to_le_bytes());
1935                    Ok(bytes)
1936                } else if let Operand2::Imm(imm) = op2 {
1937                    let rd_bits = reg_to_bits(rd);
1938                    let rn_bits = reg_to_bits(rn);
1939
1940                    // Thumb-2 AND.W immediate T1: 11110 i 0 0000 S Rn | 0 imm3 Rd imm8.
1941                    // The i:imm3:imm8 field is a ThumbExpandImm modified immediate —
1942                    // encode it correctly (or error on an un-encodable value)
1943                    // rather than packing raw bits, closing the silent-miscompile
1944                    // class for AND alongside ORR/EOR (#251) / ADD/SUB (#253) /
1945                    // CMP (#255).
1946                    let field = try_thumb_expand_imm(*imm as u32).ok_or_else(|| {
1947                        synth_core::Error::synthesis(
1948                            "AND immediate is not a valid ThumbExpandImm — materialize into a register",
1949                        )
1950                    })?;
1951                    let i_bit = (field >> 11) & 1;
1952                    let imm3 = (field >> 8) & 0x7;
1953                    let imm8 = field & 0xFF;
1954
1955                    let hw1: u16 = (0xF000 | (i_bit << 10) | rn_bits) as u16;
1956                    let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
1957
1958                    let mut bytes = hw1.to_le_bytes().to_vec();
1959                    bytes.extend_from_slice(&hw2.to_le_bytes());
1960                    Ok(bytes)
1961                } else {
1962                    // RegShift variant - fallback to NOP
1963                    let instr: u16 = 0xBF00;
1964                    Ok(instr.to_le_bytes().to_vec())
1965                }
1966            }
1967
1968            // ORR (Thumb-2 32-bit)
1969            ArmOp::Orr { rd, rn, op2 } => {
1970                if let Operand2::Reg(rm) = op2 {
1971                    let rd_bits = reg_to_bits(rd);
1972                    let rn_bits = reg_to_bits(rn);
1973                    let rm_bits = reg_to_bits(rm);
1974
1975                    // Thumb-2 ORR: EA40 Rn | 0 Rd 00 00 Rm
1976                    let hw1: u16 = (0xEA40 | rn_bits) as u16;
1977                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
1978
1979                    let mut bytes = hw1.to_le_bytes().to_vec();
1980                    bytes.extend_from_slice(&hw2.to_le_bytes());
1981                    Ok(bytes)
1982                } else if let Operand2::Imm(imm) = op2 {
1983                    // ORR.W immediate T1: 11110 i 0 0010 S Rn | 0 imm3 Rd imm8.
1984                    // Only the zero-extended byte form (imm <= 0xFF) is encoded;
1985                    // larger modified immediates need ThumbExpandImm — return an
1986                    // error rather than silently emit a NOP (Ok-or-Err, #180/#185).
1987                    let imm_val = *imm as u32;
1988                    if imm_val > 0xFF {
1989                        return Err(synth_core::Error::synthesis(
1990                            "ORR immediate > 0xFF requires ThumbExpandImm (not yet implemented)",
1991                        ));
1992                    }
1993                    let rd_bits = reg_to_bits(rd);
1994                    let rn_bits = reg_to_bits(rn);
1995                    let hw1: u16 = (0xF040 | rn_bits) as u16;
1996                    let hw2: u16 = ((rd_bits << 8) | (imm_val & 0xFF)) as u16;
1997                    let mut bytes = hw1.to_le_bytes().to_vec();
1998                    bytes.extend_from_slice(&hw2.to_le_bytes());
1999                    Ok(bytes)
2000                } else {
2001                    let instr: u16 = 0xBF00;
2002                    Ok(instr.to_le_bytes().to_vec())
2003                }
2004            }
2005
2006            // EOR (Thumb-2 32-bit)
2007            ArmOp::Eor { rd, rn, op2 } => {
2008                if let Operand2::Reg(rm) = op2 {
2009                    let rd_bits = reg_to_bits(rd);
2010                    let rn_bits = reg_to_bits(rn);
2011                    let rm_bits = reg_to_bits(rm);
2012
2013                    // Thumb-2 EOR: EA80 Rn | 0 Rd 00 00 Rm
2014                    let hw1: u16 = (0xEA80 | rn_bits) as u16;
2015                    let hw2: u16 = ((rd_bits << 8) | rm_bits) as u16;
2016
2017                    let mut bytes = hw1.to_le_bytes().to_vec();
2018                    bytes.extend_from_slice(&hw2.to_le_bytes());
2019                    Ok(bytes)
2020                } else if let Operand2::Imm(imm) = op2 {
2021                    // EOR.W immediate T1: 11110 i 0 0100 S Rn | 0 imm3 Rd imm8.
2022                    // Byte form only (imm <= 0xFF); larger needs ThumbExpandImm —
2023                    // error, not a silent NOP (Ok-or-Err, #180/#185).
2024                    let imm_val = *imm as u32;
2025                    if imm_val > 0xFF {
2026                        return Err(synth_core::Error::synthesis(
2027                            "EOR immediate > 0xFF requires ThumbExpandImm (not yet implemented)",
2028                        ));
2029                    }
2030                    let rd_bits = reg_to_bits(rd);
2031                    let rn_bits = reg_to_bits(rn);
2032                    let hw1: u16 = (0xF080 | rn_bits) as u16;
2033                    let hw2: u16 = ((rd_bits << 8) | (imm_val & 0xFF)) as u16;
2034                    let mut bytes = hw1.to_le_bytes().to_vec();
2035                    bytes.extend_from_slice(&hw2.to_le_bytes());
2036                    Ok(bytes)
2037                } else {
2038                    let instr: u16 = 0xBF00;
2039                    Ok(instr.to_le_bytes().to_vec())
2040                }
2041            }
2042
2043            // Shift operations (16-bit for low registers)
2044            ArmOp::Lsl { rd, rn, shift } => {
2045                let rd_bits = reg_to_bits(rd) as u16;
2046                let rn_bits = reg_to_bits(rn) as u16;
2047                let shift_bits = (*shift as u16) & 0x1F;
2048
2049                if rd_bits < 8 && rn_bits < 8 {
2050                    // LSLS Rd, Rm, #imm5 (16-bit): 0000 0 imm5 Rm Rd
2051                    let instr: u16 = (shift_bits << 6) | (rn_bits << 3) | rd_bits;
2052                    Ok(instr.to_le_bytes().to_vec())
2053                } else {
2054                    // Use 32-bit encoding for high registers
2055                    self.encode_thumb32_shift(rd, rn, *shift, 0b00) // LSL type
2056                }
2057            }
2058
2059            ArmOp::Lsr { rd, rn, shift } => {
2060                let rd_bits = reg_to_bits(rd) as u16;
2061                let rn_bits = reg_to_bits(rn) as u16;
2062                let shift_bits = (*shift as u16) & 0x1F;
2063
2064                if rd_bits < 8 && rn_bits < 8 && shift_bits > 0 {
2065                    // LSRS Rd, Rm, #imm5 (16-bit): 0000 1 imm5 Rm Rd
2066                    let instr: u16 = 0x0800 | (shift_bits << 6) | (rn_bits << 3) | rd_bits;
2067                    Ok(instr.to_le_bytes().to_vec())
2068                } else {
2069                    self.encode_thumb32_shift(rd, rn, *shift, 0b01) // LSR type
2070                }
2071            }
2072
2073            ArmOp::Asr { rd, rn, shift } => {
2074                let rd_bits = reg_to_bits(rd) as u16;
2075                let rn_bits = reg_to_bits(rn) as u16;
2076                let shift_bits = (*shift as u16) & 0x1F;
2077
2078                if rd_bits < 8 && rn_bits < 8 && shift_bits > 0 {
2079                    // ASRS Rd, Rm, #imm5 (16-bit): 0001 0 imm5 Rm Rd
2080                    let instr: u16 = 0x1000 | (shift_bits << 6) | (rn_bits << 3) | rd_bits;
2081                    Ok(instr.to_le_bytes().to_vec())
2082                } else {
2083                    self.encode_thumb32_shift(rd, rn, *shift, 0b10) // ASR type
2084                }
2085            }
2086
2087            ArmOp::Ror { rd, rn, shift } => {
2088                // ROR doesn't have a 16-bit immediate form, use 32-bit
2089                self.encode_thumb32_shift(rd, rn, *shift, 0b11) // ROR type
2090            }
2091
2092            // Register-based shifts (Thumb-2 32-bit)
2093            // Encoding: 11111010 0xxS Rn 1111 Rd 0000 Rm
2094            // xx = shift type: 00=LSL, 01=LSR, 10=ASR, 11=ROR
2095            ArmOp::LslReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b00),
2096            ArmOp::LsrReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b01),
2097            ArmOp::AsrReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b10),
2098            ArmOp::RorReg { rd, rn, rm } => self.encode_thumb32_shift_reg(rd, rn, rm, 0b11),
2099
2100            // RSB (Reverse Subtract): Rd = imm - Rn
2101            // Thumb-2 T2 encoding: 11110 i 0 1110 S Rn | 0 imm3 Rd imm8
2102            ArmOp::Rsb { rd, rn, imm } => {
2103                let rd_bits = reg_to_bits(rd);
2104                let rn_bits = reg_to_bits(rn);
2105                let imm_val = *imm;
2106
2107                let i_bit = (imm_val >> 11) & 1;
2108                let imm3 = (imm_val >> 8) & 0x7;
2109                let imm8 = imm_val & 0xFF;
2110
2111                // hw1: 11110 i 01110 0 Rn  (S=0)
2112                let hw1: u16 = (0xF1C0 | (i_bit << 10) | rn_bits) as u16;
2113                // hw2: 0 imm3 Rd imm8
2114                let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
2115
2116                let mut bytes = hw1.to_le_bytes().to_vec();
2117                bytes.extend_from_slice(&hw2.to_le_bytes());
2118                Ok(bytes)
2119            }
2120
2121            // CLZ (Thumb-2 32-bit)
2122            ArmOp::Clz { rd, rm } => {
2123                let rd_bits = reg_to_bits(rd);
2124                let rm_bits = reg_to_bits(rm);
2125
2126                // Thumb-2 CLZ: FAB0 Rm | F8 Rd Rm
2127                // 11111010 1011 Rm | 1111 1000 Rd Rm
2128                let hw1: u16 = (0xFAB0 | rm_bits) as u16;
2129                let hw2: u16 = (0xF080 | (rd_bits << 8) | rm_bits) as u16;
2130
2131                let mut bytes = hw1.to_le_bytes().to_vec();
2132                bytes.extend_from_slice(&hw2.to_le_bytes());
2133                Ok(bytes)
2134            }
2135
2136            // RBIT (Thumb-2 32-bit)
2137            ArmOp::Rbit { rd, rm } => {
2138                let rd_bits = reg_to_bits(rd);
2139                let rm_bits = reg_to_bits(rm);
2140
2141                // Thumb-2 RBIT: FA90 Rm | F0 Rd A0 Rm
2142                // 11111010 1001 Rm | 1111 Rd 1010 Rm
2143                let hw1: u16 = (0xFA90 | rm_bits) as u16;
2144                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rm_bits) as u16;
2145
2146                let mut bytes = hw1.to_le_bytes().to_vec();
2147                bytes.extend_from_slice(&hw2.to_le_bytes());
2148                Ok(bytes)
2149            }
2150
2151            // SXTB (16-bit for low registers)
2152            ArmOp::Sxtb { rd, rm } => {
2153                let rd_bits = reg_to_bits(rd) as u16;
2154                let rm_bits = reg_to_bits(rm) as u16;
2155
2156                if rd_bits < 8 && rm_bits < 8 {
2157                    // SXTB Rd, Rm (16-bit): 1011 0010 01 Rm Rd
2158                    let instr: u16 = 0xB240 | (rm_bits << 3) | rd_bits;
2159                    Ok(instr.to_le_bytes().to_vec())
2160                } else {
2161                    // Thumb-2 SXTB.W: FA4F F(rd)80 (rm)
2162                    // 11111010 0100 1111 | 1111 Rd 10 rotate Rm
2163                    let rd_bits32 = rd_bits as u32;
2164                    let rm_bits32 = rm_bits as u32;
2165                    let hw1: u16 = 0xFA4F;
2166                    let hw2: u16 = (0xF080 | (rd_bits32 << 8) | rm_bits32) as u16;
2167                    let mut bytes = hw1.to_le_bytes().to_vec();
2168                    bytes.extend_from_slice(&hw2.to_le_bytes());
2169                    Ok(bytes)
2170                }
2171            }
2172
2173            // SXTH (16-bit for low registers)
2174            ArmOp::Sxth { rd, rm } => {
2175                let rd_bits = reg_to_bits(rd) as u16;
2176                let rm_bits = reg_to_bits(rm) as u16;
2177
2178                if rd_bits < 8 && rm_bits < 8 {
2179                    // SXTH Rd, Rm (16-bit): 1011 0010 00 Rm Rd
2180                    let instr: u16 = 0xB200 | (rm_bits << 3) | rd_bits;
2181                    Ok(instr.to_le_bytes().to_vec())
2182                } else {
2183                    // Thumb-2 SXTH.W: FA0F F(rd)80 (rm)
2184                    // 11111010 0000 1111 | 1111 Rd 10 rotate Rm
2185                    let rd_bits32 = rd_bits as u32;
2186                    let rm_bits32 = rm_bits as u32;
2187                    let hw1: u16 = 0xFA0F;
2188                    let hw2: u16 = (0xF080 | (rd_bits32 << 8) | rm_bits32) as u16;
2189                    let mut bytes = hw1.to_le_bytes().to_vec();
2190                    bytes.extend_from_slice(&hw2.to_le_bytes());
2191                    Ok(bytes)
2192                }
2193            }
2194
2195            // UXTB Rd,Rm — zero-extend byte (rd = rm & 0xff)
2196            ArmOp::Uxtb { rd, rm } => {
2197                let rd_bits = reg_to_bits(rd) as u16;
2198                let rm_bits = reg_to_bits(rm) as u16;
2199                if rd_bits < 8 && rm_bits < 8 {
2200                    // UXTB Rd, Rm (16-bit): 1011 0010 11 Rm Rd
2201                    let instr: u16 = 0xB2C0 | (rm_bits << 3) | rd_bits;
2202                    Ok(instr.to_le_bytes().to_vec())
2203                } else {
2204                    // Thumb-2 UXTB.W: FA5F F(rd)80 (rm)
2205                    let hw1: u16 = 0xFA5F;
2206                    let hw2: u16 = (0xF080 | ((rd_bits as u32) << 8) | rm_bits as u32) as u16;
2207                    let mut bytes = hw1.to_le_bytes().to_vec();
2208                    bytes.extend_from_slice(&hw2.to_le_bytes());
2209                    Ok(bytes)
2210                }
2211            }
2212
2213            // UXTH Rd,Rm — zero-extend halfword (rd = rm & 0xffff)
2214            ArmOp::Uxth { rd, rm } => {
2215                let rd_bits = reg_to_bits(rd) as u16;
2216                let rm_bits = reg_to_bits(rm) as u16;
2217                if rd_bits < 8 && rm_bits < 8 {
2218                    // UXTH Rd, Rm (16-bit): 1011 0010 10 Rm Rd
2219                    let instr: u16 = 0xB280 | (rm_bits << 3) | rd_bits;
2220                    Ok(instr.to_le_bytes().to_vec())
2221                } else {
2222                    // Thumb-2 UXTH.W: FA1F F(rd)80 (rm)
2223                    let hw1: u16 = 0xFA1F;
2224                    let hw2: u16 = (0xF080 | ((rd_bits as u32) << 8) | rm_bits as u32) as u16;
2225                    let mut bytes = hw1.to_le_bytes().to_vec();
2226                    bytes.extend_from_slice(&hw2.to_le_bytes());
2227                    Ok(bytes)
2228                }
2229            }
2230
2231            // CMP (can be 16-bit for low registers)
2232            ArmOp::Cmp { rn, op2 } => {
2233                let rn_bits = reg_to_bits(rn) as u16;
2234
2235                if let Operand2::Imm(imm) = op2 {
2236                    // Only use 16-bit encoding for non-negative immediates 0-255
2237                    // Negative immediates must use 32-bit encoding
2238                    if *imm >= 0 && *imm <= 255 && rn_bits < 8 {
2239                        // CMP Rn, #imm8 (16-bit): 0010 1 Rn imm8
2240                        let instr: u16 = 0x2800 | (rn_bits << 8) | (*imm as u16 & 0xFF);
2241                        Ok(instr.to_le_bytes().to_vec())
2242                    } else {
2243                        self.encode_thumb32_cmp_imm(rn, *imm as u32)
2244                    }
2245                } else if let Operand2::Reg(rm) = op2 {
2246                    let rm_bits = reg_to_bits(rm) as u16;
2247                    if rn_bits < 8 && rm_bits < 8 {
2248                        // CMP Rn, Rm (16-bit low): 0100 0010 10 Rm Rn
2249                        let instr: u16 = 0x4280 | (rm_bits << 3) | rn_bits;
2250                        Ok(instr.to_le_bytes().to_vec())
2251                    } else {
2252                        // CMP Rn, Rm (16-bit high): 0100 0101 N Rm Rn[2:0]
2253                        let n_bit = (rn_bits >> 3) & 1;
2254                        let instr: u16 = 0x4500 | (n_bit << 7) | (rm_bits << 3) | (rn_bits & 0x7);
2255                        Ok(instr.to_le_bytes().to_vec())
2256                    }
2257                } else {
2258                    let instr: u16 = 0xBF00;
2259                    Ok(instr.to_le_bytes().to_vec())
2260                }
2261            }
2262
2263            // CMN (Compare Negative) - computes Rn + op2 and sets flags
2264            // CMN Rn, #1 sets Z flag if Rn == -1 (since -1 + 1 = 0)
2265            ArmOp::Cmn { rn, op2 } => {
2266                let rn_bits = reg_to_bits(rn) as u16;
2267
2268                if let Operand2::Imm(imm) = op2 {
2269                    // CMN.W Rn, #imm (32-bit): i:imm3:imm8 is a ThumbExpandImm
2270                    // modified immediate (the field sits in imm3=hw2[14:12],
2271                    // imm8=hw2[7:0], i=hw1[10]). Encode it correctly, or error on
2272                    // an un-encodable value — replacing the old silent `0xBF00`
2273                    // NOP (the last of the silent-miscompile data-proc encoders).
2274                    let field = try_thumb_expand_imm(*imm as u32).ok_or_else(|| {
2275                        synth_core::Error::synthesis(
2276                            "CMN immediate is not a valid ThumbExpandImm — materialize into a register",
2277                        )
2278                    })?;
2279                    let i_bit = (field >> 11) & 1;
2280                    let imm3 = (field >> 8) & 0x7;
2281                    let imm8 = field & 0xFF;
2282                    let hw1: u16 = (0xF110 | (i_bit << 10) as u16) | rn_bits;
2283                    let hw2: u16 = (imm3 << 12) as u16 | 0x0F00 | imm8 as u16;
2284                    let mut bytes = hw1.to_le_bytes().to_vec();
2285                    bytes.extend_from_slice(&hw2.to_le_bytes());
2286                    Ok(bytes)
2287                } else if let Operand2::Reg(rm) = op2 {
2288                    let rm_bits = reg_to_bits(rm) as u16;
2289                    // 16-bit CMN (T1) only encodes R0-R7; high registers overflow
2290                    // the 3-bit fields and corrupt the operands (#184, the #180
2291                    // class). CMN has no high-register 16-bit form, so fall back
2292                    // to 32-bit CMN.W (T2): EB10 Rn | 0F00 Rm (ADD.W with S=1 and
2293                    // Rd discarded as PC/1111).
2294                    if rn_bits < 8 && rm_bits < 8 {
2295                        // CMN Rn, Rm (16-bit): 0100 0010 11 Rm Rn
2296                        let instr: u16 = 0x42C0 | (rm_bits << 3) | rn_bits;
2297                        Ok(instr.to_le_bytes().to_vec())
2298                    } else {
2299                        let hw1: u16 = 0xEB10 | rn_bits;
2300                        let hw2: u16 = 0x0F00 | rm_bits;
2301                        let mut bytes = hw1.to_le_bytes().to_vec();
2302                        bytes.extend_from_slice(&hw2.to_le_bytes());
2303                        Ok(bytes)
2304                    }
2305                } else {
2306                    Ok(vec![0xBF, 0x00])
2307                }
2308            }
2309
2310            // LDR (can be 16-bit for simple cases)
2311            ArmOp::Ldr { rd, addr } => {
2312                let rd_bits = reg_to_bits(rd);
2313                let base_bits = reg_to_bits(&addr.base);
2314
2315                // Handle register offset mode [base, Roff] or [base, Roff, #imm]
2316                if let Some(offset_reg) = &addr.offset_reg {
2317                    let rm_bits = reg_to_bits(offset_reg);
2318
2319                    // If there's also an immediate offset, we need to ADD it first
2320                    if addr.offset != 0 {
2321                        // Use R12 (IP) as scratch to avoid clobbering the address register
2322                        // ADD R12, Rm, #offset; LDR Rd, [base, R12]
2323                        let scratch = Reg::R12;
2324                        let mut bytes =
2325                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2326                        bytes.extend(self.encode_thumb32_ldr_reg(rd, &addr.base, &scratch)?);
2327                        return Ok(bytes);
2328                    }
2329
2330                    // Simple register offset: LDR Rd, [Rn, Rm]
2331                    // 16-bit: only if Rd, Rn, Rm < R8
2332                    if rd_bits < 8 && base_bits < 8 && rm_bits < 8 {
2333                        // LDR Rd, [Rn, Rm] (16-bit): 0101 100 Rm Rn Rd
2334                        let instr: u16 = 0x5800
2335                            | ((rm_bits as u16) << 6)
2336                            | ((base_bits as u16) << 3)
2337                            | (rd_bits as u16);
2338                        return Ok(instr.to_le_bytes().to_vec());
2339                    }
2340
2341                    // 32-bit register offset
2342                    return self.encode_thumb32_ldr_reg(rd, &addr.base, offset_reg);
2343                }
2344
2345                // Immediate offset mode [base, #imm]
2346                let offset = addr.offset as u32;
2347
2348                if rd_bits < 8 && base_bits < 8 && (offset & 0x3) == 0 && offset <= 124 {
2349                    // LDR Rd, [Rn, #imm5*4] (16-bit): 0110 1 imm5 Rn Rd
2350                    let imm5 = (offset >> 2) as u16;
2351                    let instr: u16 =
2352                        0x6800 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2353                    Ok(instr.to_le_bytes().to_vec())
2354                } else {
2355                    self.encode_thumb32_ldr(rd, &addr.base, offset)
2356                }
2357            }
2358
2359            // STR (can be 16-bit for simple cases)
2360            ArmOp::Str { rd, addr } => {
2361                let rd_bits = reg_to_bits(rd);
2362                let base_bits = reg_to_bits(&addr.base);
2363
2364                // Handle register offset mode [base, Roff] or [base, Roff, #imm]
2365                if let Some(offset_reg) = &addr.offset_reg {
2366                    let rm_bits = reg_to_bits(offset_reg);
2367
2368                    // If there's also an immediate offset, we need to ADD it first
2369                    if addr.offset != 0 {
2370                        // Use R12 (IP) as scratch to avoid clobbering the address register
2371                        // ADD R12, Rm, #offset; STR Rd, [base, R12]
2372                        let scratch = Reg::R12;
2373                        let mut bytes =
2374                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2375                        bytes.extend(self.encode_thumb32_str_reg(rd, &addr.base, &scratch)?);
2376                        return Ok(bytes);
2377                    }
2378
2379                    // Simple register offset: STR Rd, [Rn, Rm]
2380                    // 16-bit: only if Rd, Rn, Rm < R8
2381                    if rd_bits < 8 && base_bits < 8 && rm_bits < 8 {
2382                        // STR Rd, [Rn, Rm] (16-bit): 0101 000 Rm Rn Rd
2383                        let instr: u16 = 0x5000
2384                            | ((rm_bits as u16) << 6)
2385                            | ((base_bits as u16) << 3)
2386                            | (rd_bits as u16);
2387                        return Ok(instr.to_le_bytes().to_vec());
2388                    }
2389
2390                    // 32-bit register offset
2391                    return self.encode_thumb32_str_reg(rd, &addr.base, offset_reg);
2392                }
2393
2394                // Immediate offset mode [base, #imm]
2395                let offset = addr.offset as u32;
2396
2397                if rd_bits < 8 && base_bits < 8 && (offset & 0x3) == 0 && offset <= 124 {
2398                    // STR Rd, [Rn, #imm5*4] (16-bit): 0110 0 imm5 Rn Rd
2399                    let imm5 = (offset >> 2) as u16;
2400                    let instr: u16 =
2401                        0x6000 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2402                    Ok(instr.to_le_bytes().to_vec())
2403                } else {
2404                    self.encode_thumb32_str(rd, &addr.base, offset)
2405                }
2406            }
2407
2408            // LDRB (Thumb-2)
2409            ArmOp::Ldrb { rd, addr } => {
2410                let rd_bits = reg_to_bits(rd);
2411                let base_bits = reg_to_bits(&addr.base);
2412
2413                if let Some(offset_reg) = &addr.offset_reg {
2414                    if addr.offset != 0 {
2415                        let scratch = Reg::R12;
2416                        let mut bytes =
2417                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2418                        bytes.extend(self.encode_thumb32_ldrb_reg(rd, &addr.base, &scratch)?);
2419                        return Ok(bytes);
2420                    }
2421                    return self.encode_thumb32_ldrb_reg(rd, &addr.base, offset_reg);
2422                }
2423
2424                let offset = addr.offset as u32;
2425                if rd_bits < 8 && base_bits < 8 && offset <= 31 {
2426                    // LDRB Rd, [Rn, #imm5] (16-bit): 0111 1 imm5 Rn Rd
2427                    let instr: u16 = 0x7800
2428                        | ((offset as u16) << 6)
2429                        | ((base_bits as u16) << 3)
2430                        | (rd_bits as u16);
2431                    Ok(instr.to_le_bytes().to_vec())
2432                } else {
2433                    self.encode_thumb32_ldrb_imm(rd, &addr.base, offset)
2434                }
2435            }
2436
2437            // LDRSB (Thumb-2)
2438            ArmOp::Ldrsb { rd, addr } => {
2439                let rd_bits = reg_to_bits(rd);
2440                let base_bits = reg_to_bits(&addr.base);
2441
2442                if let Some(offset_reg) = &addr.offset_reg {
2443                    if addr.offset != 0 {
2444                        let scratch = Reg::R12;
2445                        let mut bytes =
2446                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2447                        bytes.extend(self.encode_thumb32_ldrsb_reg(rd, &addr.base, &scratch)?);
2448                        return Ok(bytes);
2449                    }
2450                    return self.encode_thumb32_ldrsb_reg(rd, &addr.base, offset_reg);
2451                }
2452
2453                let offset = addr.offset as u32;
2454                // LDRSB has no 16-bit immediate form (only register)
2455                // For 16-bit reg form: only if Rd, Rn, Rm < R8
2456                if rd_bits < 8 && base_bits < 8 && offset == 0 {
2457                    // No immediate 16-bit encoding for LDRSB; use 32-bit
2458                    self.encode_thumb32_ldrsb_imm(rd, &addr.base, offset)
2459                } else {
2460                    self.encode_thumb32_ldrsb_imm(rd, &addr.base, offset)
2461                }
2462            }
2463
2464            // LDRH (Thumb-2)
2465            ArmOp::Ldrh { rd, addr } => {
2466                let rd_bits = reg_to_bits(rd);
2467                let base_bits = reg_to_bits(&addr.base);
2468
2469                if let Some(offset_reg) = &addr.offset_reg {
2470                    if addr.offset != 0 {
2471                        let scratch = Reg::R12;
2472                        let mut bytes =
2473                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2474                        bytes.extend(self.encode_thumb32_ldrh_reg(rd, &addr.base, &scratch)?);
2475                        return Ok(bytes);
2476                    }
2477                    return self.encode_thumb32_ldrh_reg(rd, &addr.base, offset_reg);
2478                }
2479
2480                let offset = addr.offset as u32;
2481                if rd_bits < 8 && base_bits < 8 && (offset & 0x1) == 0 && offset <= 62 {
2482                    // LDRH Rd, [Rn, #imm5*2] (16-bit): 1000 1 imm5 Rn Rd
2483                    let imm5 = (offset >> 1) as u16;
2484                    let instr: u16 =
2485                        0x8800 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2486                    Ok(instr.to_le_bytes().to_vec())
2487                } else {
2488                    self.encode_thumb32_ldrh_imm(rd, &addr.base, offset)
2489                }
2490            }
2491
2492            // LDRSH (Thumb-2)
2493            ArmOp::Ldrsh { rd, addr } => {
2494                if let Some(offset_reg) = &addr.offset_reg {
2495                    if addr.offset != 0 {
2496                        let scratch = Reg::R12;
2497                        let mut bytes =
2498                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2499                        bytes.extend(self.encode_thumb32_ldrsh_reg(rd, &addr.base, &scratch)?);
2500                        return Ok(bytes);
2501                    }
2502                    return self.encode_thumb32_ldrsh_reg(rd, &addr.base, offset_reg);
2503                }
2504
2505                let offset = addr.offset as u32;
2506                self.encode_thumb32_ldrsh_imm(rd, &addr.base, offset)
2507            }
2508
2509            // STRB (Thumb-2)
2510            ArmOp::Strb { rd, addr } => {
2511                let rd_bits = reg_to_bits(rd);
2512                let base_bits = reg_to_bits(&addr.base);
2513
2514                if let Some(offset_reg) = &addr.offset_reg {
2515                    if addr.offset != 0 {
2516                        let scratch = Reg::R12;
2517                        let mut bytes =
2518                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2519                        bytes.extend(self.encode_thumb32_strb_reg(rd, &addr.base, &scratch)?);
2520                        return Ok(bytes);
2521                    }
2522                    return self.encode_thumb32_strb_reg(rd, &addr.base, offset_reg);
2523                }
2524
2525                let offset = addr.offset as u32;
2526                if rd_bits < 8 && base_bits < 8 && offset <= 31 {
2527                    // STRB Rd, [Rn, #imm5] (16-bit): 0111 0 imm5 Rn Rd
2528                    let instr: u16 = 0x7000
2529                        | ((offset as u16) << 6)
2530                        | ((base_bits as u16) << 3)
2531                        | (rd_bits as u16);
2532                    Ok(instr.to_le_bytes().to_vec())
2533                } else {
2534                    self.encode_thumb32_strb_imm(rd, &addr.base, offset)
2535                }
2536            }
2537
2538            // STRH (Thumb-2)
2539            ArmOp::Strh { rd, addr } => {
2540                let rd_bits = reg_to_bits(rd);
2541                let base_bits = reg_to_bits(&addr.base);
2542
2543                if let Some(offset_reg) = &addr.offset_reg {
2544                    if addr.offset != 0 {
2545                        let scratch = Reg::R12;
2546                        let mut bytes =
2547                            self.encode_thumb32_add_imm(&scratch, offset_reg, addr.offset as u32)?;
2548                        bytes.extend(self.encode_thumb32_strh_reg(rd, &addr.base, &scratch)?);
2549                        return Ok(bytes);
2550                    }
2551                    return self.encode_thumb32_strh_reg(rd, &addr.base, offset_reg);
2552                }
2553
2554                let offset = addr.offset as u32;
2555                if rd_bits < 8 && base_bits < 8 && (offset & 0x1) == 0 && offset <= 62 {
2556                    // STRH Rd, [Rn, #imm5*2] (16-bit): 1000 0 imm5 Rn Rd
2557                    let imm5 = (offset >> 1) as u16;
2558                    let instr: u16 =
2559                        0x8000 | (imm5 << 6) | ((base_bits as u16) << 3) | (rd_bits as u16);
2560                    Ok(instr.to_le_bytes().to_vec())
2561                } else {
2562                    self.encode_thumb32_strh_imm(rd, &addr.base, offset)
2563                }
2564            }
2565
2566            // MemorySize (Thumb-2)
2567            ArmOp::MemorySize { rd } => {
2568                // LSR rd, R10, #16 — memory size in bytes / 65536 = pages
2569                // Thumb-2 16-bit: LSRS Rd, Rm, #imm5 — 0000 1 imm5 Rm Rd
2570                let rd_bits = reg_to_bits(rd);
2571                let r10_bits = reg_to_bits(&Reg::R10);
2572                if rd_bits < 8 && r10_bits < 8 {
2573                    let instr: u16 =
2574                        0x0800 | (16u16 << 6) | ((r10_bits as u16) << 3) | (rd_bits as u16);
2575                    Ok(instr.to_le_bytes().to_vec())
2576                } else {
2577                    // Thumb-2 32-bit LSR: 1110 1010 010 0 1111 | 0 imm3 Rd imm2 01 Rm
2578                    let imm5: u32 = 16;
2579                    let imm3 = (imm5 >> 2) & 0x7;
2580                    let imm2 = imm5 & 0x3;
2581                    let hw1: u16 = 0xEA4F;
2582                    let hw2: u16 =
2583                        ((imm3 << 12) | (rd_bits << 8) | (imm2 << 6) | 0x10 | r10_bits) as u16;
2584                    let mut bytes = hw1.to_le_bytes().to_vec();
2585                    bytes.extend_from_slice(&hw2.to_le_bytes());
2586                    Ok(bytes)
2587                }
2588            }
2589
2590            // MemoryGrow (Thumb-2)
2591            ArmOp::MemoryGrow { rd, .. } => {
2592                // On embedded with fixed memory, always return -1 (failure)
2593                // MVN rd, #0 → MOV rd, #-1
2594                // Thumb-2 32-bit: MVN: 1111 0 i 0 0 0 1 1 0 1111 | 0 imm3 Rd imm8
2595                let rd_bits = reg_to_bits(rd);
2596                let hw1: u16 = 0xF06F; // MVN with i=0
2597                let hw2: u16 = (rd_bits << 8) as u16; // imm8=0 → ~0 = 0xFFFFFFFF = -1
2598                let mut bytes = hw1.to_le_bytes().to_vec();
2599                bytes.extend_from_slice(&hw2.to_le_bytes());
2600                Ok(bytes)
2601            }
2602
2603            // BX (16-bit)
2604            ArmOp::Bx { rm } => {
2605                let rm_bits = reg_to_bits(rm) as u16;
2606                // BX Rm (16-bit): 0100 0111 0 Rm 000
2607                let instr: u16 = 0x4700 | (rm_bits << 3);
2608                Ok(instr.to_le_bytes().to_vec())
2609            }
2610
2611            // BLX (16-bit) - Branch with Link and Exchange
2612            // BLX Rm: 0100 0111 1 Rm 000
2613            ArmOp::Blx { rm } => {
2614                let rm_bits = reg_to_bits(rm) as u16;
2615                let instr: u16 = 0x4780 | (rm_bits << 3);
2616                Ok(instr.to_le_bytes().to_vec())
2617            }
2618
2619            // CallIndirect - indirect function call via table lookup
2620            // table_index_reg contains the table index
2621            // Generates: LSL R12, idx, #2; LDR R12, [R12, table_base]; BLX R12
2622            ArmOp::CallIndirect {
2623                rd: _,
2624                type_idx: _,
2625                table_index_reg,
2626            } => {
2627                let idx_reg = reg_to_bits(table_index_reg);
2628                let mut bytes = Vec::new();
2629
2630                // For now, we generate code that:
2631                // 1. Multiplies index by 4 (function pointer size)
2632                // 2. Loads function pointer from table (assumes table base in R11)
2633                // 3. Calls the function via BLX
2634                //
2635                // Table base setup must be done by caller/runtime.
2636                // This is a simplified implementation - full support needs:
2637                // - Table base address resolution
2638                // - Type signature checking
2639                // - Bounds checking
2640
2641                // LSL R12, idx_reg, #2 (multiply index by 4)
2642                // Thumb-2 MOV with shift: 11101010 010 S 1111 | 0 imm3 Rd imm2 type Rm
2643                // LSL: type=00, imm5=2 -> imm3=0, imm2=10
2644                let hw1: u16 = 0xEA4F_u16; // MOV.W R12, Rm, LSL #2
2645                let hw2: u16 = ((0x0C00 | (0b10 << 4)) | idx_reg) as u16;
2646                bytes.extend_from_slice(&hw1.to_le_bytes());
2647                bytes.extend_from_slice(&hw2.to_le_bytes());
2648
2649                // LDR R12, [R11, R12] - load function pointer
2650                // Thumb-2 LDR (register): 1111 1000 0101 Rn | Rt 0000 00 imm2 Rm
2651                // Rn=R11, Rt=R12, Rm=R12, imm2=00 (no shift)
2652                let ldr_hw1: u16 = 0xF85B; // LDR.W Rt, [R11, Rm]
2653                let ldr_hw2: u16 = 0xC00C; // Rt=R12, imm2=00, Rm=R12
2654                bytes.extend_from_slice(&ldr_hw1.to_le_bytes());
2655                bytes.extend_from_slice(&ldr_hw2.to_le_bytes());
2656
2657                // BLX R12 (call function indirectly)
2658                // BLX Rm (16-bit): 0100 0111 1 Rm 000
2659                let blx: u16 = 0x47E0; // BLX R12
2660                bytes.extend_from_slice(&blx.to_le_bytes());
2661
2662                Ok(bytes)
2663            }
2664
2665            // Label pseudo-instruction: emits no machine code
2666            ArmOp::Label { .. } => Ok(Vec::new()),
2667
2668            // Conditional branch to label (generic) - offset 0, will be patched
2669            ArmOp::Bcc { cond, label: _ } => {
2670                use synth_synthesis::Condition;
2671                let cond_bits: u16 = match cond {
2672                    Condition::EQ => 0x0,
2673                    Condition::NE => 0x1,
2674                    Condition::HS => 0x2,
2675                    Condition::LO => 0x3,
2676                    Condition::HI => 0x8,
2677                    Condition::LS => 0x9,
2678                    Condition::GE => 0xA,
2679                    Condition::LT => 0xB,
2680                    Condition::GT => 0xC,
2681                    Condition::LE => 0xD,
2682                };
2683                // 16-bit B<cond> with offset 0: 1101 cond imm8
2684                let instr: u16 = 0xD000 | (cond_bits << 8);
2685                Ok(instr.to_le_bytes().to_vec())
2686            }
2687
2688            // Branch instructions
2689            ArmOp::B { label: _ } => {
2690                // Simplified: B.N with offset 0
2691                // For real usage, would need label resolution
2692                let instr: u16 = 0xE000; // B.N #0
2693                Ok(instr.to_le_bytes().to_vec())
2694            }
2695
2696            // BHS (Branch if Higher or Same) - used for bounds checking
2697            // Condition code: 0x2 (C set)
2698            ArmOp::Bhs { label: _ } => {
2699                // 16-bit B<cond> with offset 0: 1101 cond imm8
2700                // cond = 0x2 (HS)
2701                let instr: u16 = 0xD200; // BHS.N #0
2702                Ok(instr.to_le_bytes().to_vec())
2703            }
2704
2705            // BLO (Branch if Lower) - complementary to BHS
2706            // Condition code: 0x3 (C clear)
2707            ArmOp::Blo { label: _ } => {
2708                // 16-bit B<cond> with offset 0: 1101 cond imm8
2709                // cond = 0x3 (LO)
2710                let instr: u16 = 0xD300; // BLO.N #0
2711                Ok(instr.to_le_bytes().to_vec())
2712            }
2713
2714            // Branch with numeric offset (Thumb-2)
2715            // Thumb-2 B.W instruction: 32-bit with +-16MB range
2716            ArmOp::BOffset { offset } => {
2717                // offset is already the halfword displacement: (target - branch - 4) / 2
2718                // This is the raw encoded value, accounting for variable-length instructions
2719                let halfword_offset = *offset;
2720
2721                // 16-bit B.N encoding: 1110 0 imm11 (11-bit signed halfword offset)
2722                // Range: -1024 to +1022 halfwords
2723                if (-1024..=1022).contains(&halfword_offset) {
2724                    // 16-bit B.N encoding: 1110 0 imm11
2725                    let imm11 = (halfword_offset as u16) & 0x7FF;
2726                    let instr: u16 = 0xE000 | imm11;
2727                    Ok(instr.to_le_bytes().to_vec())
2728                } else {
2729                    // 32-bit B.W encoding for larger offsets
2730                    // First halfword: 1111 0 S imm10
2731                    // Second halfword: 10 J1 0 J2 imm11
2732                    // Total offset = SignExtend(S:I1:I2:imm10:imm11:0)
2733                    // where I1 = NOT(J1 XOR S), I2 = NOT(J2 XOR S)
2734
2735                    // The B.W (T4) encoding packs the signed offset as:
2736                    //   S:I1:I2:imm10:imm11:0  (25-bit signed, halfword-aligned)
2737                    // where J1 = NOT(I1 XOR S), J2 = NOT(I2 XOR S)
2738                    // Input halfword_offset already equals (target - PC - 4) / 2,
2739                    // so the full byte offset = halfword_offset << 1.
2740                    // The encoding fields split that 25-bit signed value (including the
2741                    // implicit trailing zero) as: S | imm10 | imm11
2742                    // with I1 = bit 23 and I2 = bit 22 of the signed offset.
2743                    let signed_offset = halfword_offset << 1; // byte offset
2744                    let s = if signed_offset < 0 { 1u32 } else { 0u32 };
2745                    let uoffset = signed_offset as u32;
2746                    let imm10 = (uoffset >> 12) & 0x3FF; // bits [21:12]
2747                    let imm11 = (uoffset >> 1) & 0x7FF; // bits [11:1]
2748                    let i1 = (uoffset >> 23) & 1; // bit 23
2749                    let i2 = (uoffset >> 22) & 1; // bit 22
2750                    let j1 = (!(i1 ^ s)) & 1; // J1 = NOT(I1 XOR S)
2751                    let j2 = (!(i2 ^ s)) & 1; // J2 = NOT(I2 XOR S)
2752
2753                    let hw1: u16 = (0xF000 | (s << 10) | imm10) as u16;
2754                    let hw2: u16 = (0x9000 | (j1 << 13) | (j2 << 11) | imm11) as u16;
2755
2756                    let mut bytes = hw1.to_le_bytes().to_vec();
2757                    bytes.extend_from_slice(&hw2.to_le_bytes());
2758                    Ok(bytes)
2759                }
2760            }
2761
2762            // Conditional branch with numeric offset (Thumb-2)
2763            ArmOp::BCondOffset { cond, offset } => {
2764                use synth_synthesis::Condition;
2765                let cond_bits: u16 = match cond {
2766                    Condition::EQ => 0x0,
2767                    Condition::NE => 0x1,
2768                    Condition::HS => 0x2,
2769                    Condition::LO => 0x3,
2770                    Condition::HI => 0x8,
2771                    Condition::LS => 0x9,
2772                    Condition::GE => 0xA,
2773                    Condition::LT => 0xB,
2774                    Condition::GT => 0xC,
2775                    Condition::LE => 0xD,
2776                };
2777
2778                // offset is already the halfword displacement: (target - branch - 4) / 2
2779                // This is the raw imm8 value for 16-bit B<cond> encoding
2780                let halfword_offset = *offset;
2781
2782                // 16-bit B<cond> encoding: 1101 cond imm8
2783                // Range: -256 to +254 halfwords (imm8 is sign-extended and shifted left 1)
2784                if (-128..=127).contains(&halfword_offset) {
2785                    let imm8 = (halfword_offset as u16) & 0xFF;
2786                    let instr: u16 = 0xD000 | (cond_bits << 8) | imm8;
2787                    Ok(instr.to_le_bytes().to_vec())
2788                } else {
2789                    // 32-bit B<cond>.W for larger offsets
2790                    // First halfword: 1111 0 S cond imm6
2791                    // Second halfword: 10 J1 0 J2 imm11
2792                    let offset = halfword_offset >> 1;
2793                    let s = if offset < 0 { 1u32 } else { 0u32 };
2794                    let imm6 = ((offset >> 11) as u32) & 0x3F;
2795                    let imm11 = (offset as u32) & 0x7FF;
2796                    let j1 = if s == 1 { 1 } else { 0 };
2797                    let j2 = if s == 1 { 1 } else { 0 };
2798
2799                    let hw1: u16 = (0xF000 | (s << 10) | ((cond_bits as u32) << 6) | imm6) as u16;
2800                    let hw2: u16 = (0x8000 | (j1 << 13) | (j2 << 11) | imm11) as u16;
2801
2802                    let mut bytes = hw1.to_le_bytes().to_vec();
2803                    bytes.extend_from_slice(&hw2.to_le_bytes());
2804                    Ok(bytes)
2805                }
2806            }
2807
2808            ArmOp::Bl { label: _ } => {
2809                // BL is always 32-bit in Thumb-2, encoded here as a relocatable
2810                // placeholder; an R_ARM_THM_CALL relocation patches the target
2811                // (see arm_backend.rs). The placeholder must carry an embedded
2812                // addend of -4 so the relocation nets to exactly the symbol S.
2813                //
2814                // Thumb BL computes `target = (P + 4) + signed_offset`. Under
2815                // R_ARM_THM_CALL the linker resolves using the in-place addend;
2816                // a 0xF800 placeholder (addend 0) lands at S+4 — every call one
2817                // instruction past the callee entry (#174). The correct
2818                // placeholder is what `gas` emits for `bl <extern>`:
2819                //   f7ff fffe  ->  `bl <self>`  (S=1, J1=J2=1, imm = -4 addend),
2820                // i.e. hw1=0xF7FF, hw2=0xFFFE. This nets to S, not S+4.
2821                // (The earlier 0xD000 was worse still — a ~+0x600000 addend,
2822                // the garbage `bl c0000c` and "truncated to fit" of #167.)
2823                let hw1: u16 = 0xF7FF;
2824                let hw2: u16 = 0xFFFE;
2825                let mut bytes = hw1.to_le_bytes().to_vec();
2826                bytes.extend_from_slice(&hw2.to_le_bytes());
2827                Ok(bytes)
2828            }
2829
2830            // MVN
2831            ArmOp::Mvn { rd, op2 } => {
2832                if let Operand2::Reg(rm) = op2 {
2833                    let rd_bits = reg_to_bits(rd) as u16;
2834                    let rm_bits = reg_to_bits(rm) as u16;
2835
2836                    if rd_bits < 8 && rm_bits < 8 {
2837                        // MVNS Rd, Rm (16-bit): 0100 0011 11 Rm Rd
2838                        let instr: u16 = 0x43C0 | (rm_bits << 3) | rd_bits;
2839                        Ok(instr.to_le_bytes().to_vec())
2840                    } else {
2841                        // 32-bit MVN
2842                        let hw1: u16 = 0xEA6F_u16;
2843                        let hw2: u16 = ((reg_to_bits(rd) << 8) | reg_to_bits(rm)) as u16;
2844                        let mut bytes = hw1.to_le_bytes().to_vec();
2845                        bytes.extend_from_slice(&hw2.to_le_bytes());
2846                        Ok(bytes)
2847                    }
2848                } else {
2849                    let instr: u16 = 0xBF00;
2850                    Ok(instr.to_le_bytes().to_vec())
2851                }
2852            }
2853
2854            // MOVW - Move Wide (Thumb-2 32-bit)
2855            ArmOp::Movw { rd, imm16 } => {
2856                self.encode_thumb32_movw_raw(reg_to_bits(rd), *imm16 as u32)
2857            }
2858
2859            // MOVT - Move Top (Thumb-2 32-bit)
2860            ArmOp::Movt { rd, imm16 } => {
2861                self.encode_thumb32_movt_raw(reg_to_bits(rd), *imm16 as u32)
2862            }
2863
2864            // #237: symbol-relative MOVW/MOVT. Encode the addend's low/high 16
2865            // bits in place; the backend records an R_ARM_MOVW_ABS_NC /
2866            // R_ARM_MOVT_ABS relocation against `symbol`, so the linker adds the
2867            // symbol's final address to the in-place addend (REL semantics).
2868            ArmOp::MovwSym { rd, addend, .. } => {
2869                self.encode_thumb32_movw_raw(reg_to_bits(rd), (*addend as u32) & 0xffff)
2870            }
2871            ArmOp::MovtSym { rd, addend, .. } => {
2872                self.encode_thumb32_movt_raw(reg_to_bits(rd), ((*addend as u32) >> 16) & 0xffff)
2873            }
2874
2875            // #345: literal-pool address load — emit a PLACEHOLDER `LDR.W rd,
2876            // [pc, #0]` (U=1, imm12=0). The backend (arm_backend.rs) places the
2877            // 4-byte pool word at the end of the function, records the R_ARM_ABS32
2878            // relocation against `symbol+addend`, and patches the imm12 with the
2879            // real PC-relative distance once the pool offset is known.
2880            // Encoding T2: 1111 1000 1101 1111 | Rt(4) imm12(12), with the literal
2881            // base = Align(PC,4) and PC = address of this instruction + 4.
2882            ArmOp::LdrSym { rd, .. } => {
2883                let rt = reg_to_bits(rd) as u16;
2884                let hw1: u16 = 0xF8DF; // LDR.W (literal), U=1
2885                let hw2: u16 = rt << 12; // imm12 = 0 placeholder
2886                let mut bytes = Vec::with_capacity(4);
2887                bytes.extend_from_slice(&hw1.to_le_bytes());
2888                bytes.extend_from_slice(&hw2.to_le_bytes());
2889                Ok(bytes)
2890            }
2891
2892            // SetCond: Materialize condition flag into register (0 or 1)
2893            // Strategy: ITE <cond>; MOV Rd, #1; MOV Rd, #0
2894            // IMPORTANT: Must use ITE (If-Then-Else) because 16-bit Thumb MOV
2895            // always sets flags (MOVS). We need to evaluate the condition BEFORE
2896            // any MOV instruction clobbers the flags from CMP.
2897            ArmOp::SetCond { rd, cond } => {
2898                let rd_bits = reg_to_bits(rd) as u16;
2899
2900                // Condition code encoding for IT block
2901                use synth_synthesis::Condition;
2902                let cond_bits: u16 = match cond {
2903                    Condition::EQ => 0x0,
2904                    Condition::NE => 0x1,
2905                    Condition::LT => 0xB,
2906                    Condition::LE => 0xD,
2907                    Condition::GT => 0xC,
2908                    Condition::GE => 0xA,
2909                    Condition::LO => 0x3, // CC/LO (unsigned <)
2910                    Condition::LS => 0x9, // LS (unsigned <=)
2911                    Condition::HI => 0x8, // HI (unsigned >)
2912                    Condition::HS => 0x2, // CS/HS (unsigned >=)
2913                };
2914
2915                // ITE <cond>: encodes If-Then-Else block
2916                // The mask field depends on firstcond[0]:
2917                // - If firstcond[0] = 0: mask = 0xC for TE pattern (ITE EQ = BF0C)
2918                // - If firstcond[0] = 1: mask = 0x4 for TE pattern (ITE NE = BF14)
2919                let mask = if (cond_bits & 1) == 0 { 0xC } else { 0x4 };
2920                let ite_instr: u16 = 0xBF00 | (cond_bits << 4) | mask;
2921
2922                // Materialize 0/1 into Rd. The 16-bit MOVS (T1) encodes Rd in a
2923                // 3-bit field (bits[10:8]) — only R0–R7. For a high register
2924                // (R8–R12) `rd_bits << 8` overflows into bit 11 and silently
2925                // turns MOVS into CMP (00100 → 00101), corrupting the result
2926                // (this mis-materialized gale's `has_waiter`, so its `local.set`
2927                // stored a stale register → the binary-sem WAKE dispatch read
2928                // garbage). Use the 32-bit MOV.W (T2) for high registers, which
2929                // has a 4-bit Rd field. MOV.W with S=0 doesn't set flags, which
2930                // is fine inside the ITE (the materialized value is the result;
2931                // the flags are not consumed afterwards).
2932                let mut bytes = ite_instr.to_le_bytes().to_vec();
2933                let push_mov = |bytes: &mut Vec<u8>, imm: u16| {
2934                    if rd_bits <= 7 {
2935                        let m: u16 = 0x2000 | (rd_bits << 8) | imm; // 16-bit MOVS Rd,#imm
2936                        bytes.extend_from_slice(&m.to_le_bytes());
2937                    } else {
2938                        // 32-bit MOV.W Rd, #imm (T2): F04F | (Rd<<8) | imm8
2939                        let hw1: u16 = 0xF04F;
2940                        let hw2: u16 = (rd_bits << 8) | imm;
2941                        bytes.extend_from_slice(&hw1.to_le_bytes());
2942                        bytes.extend_from_slice(&hw2.to_le_bytes());
2943                    }
2944                };
2945                push_mov(&mut bytes, 1); // Then branch (condition true)  → 1
2946                push_mov(&mut bytes, 0); // Else branch (condition false) → 0
2947                Ok(bytes)
2948            }
2949
2950            // I64SetCond: Compare two i64 register pairs, result 0/1 in rd
2951            // EQ/NE: CMP lo,lo; IT EQ; CMPEQ hi,hi; ITE <cond>; MOV 1; MOV 0
2952            // LT: CMP lo,lo; SBCS rd,hi,hi; ITE LT; MOV 1; MOV 0
2953            // GT: CMP lo,lo (swapped); SBCS rd,hi,hi (swapped); ITE LT; MOV 1; MOV 0
2954            ArmOp::I64SetCond {
2955                rd,
2956                rn_lo,
2957                rn_hi,
2958                rm_lo,
2959                rm_hi,
2960                cond,
2961            } => {
2962                use synth_synthesis::Condition;
2963                let rd_bits = reg_to_bits(rd) as u16;
2964                let mut bytes = Vec::new();
2965
2966                // Helper: encode CMP Rn, Rm (16-bit)
2967                let encode_cmp_reg = |rn: &synth_synthesis::Reg,
2968                                      rm: &synth_synthesis::Reg|
2969                 -> Vec<u8> {
2970                    let rn_bits = reg_to_bits(rn) as u16;
2971                    let rm_bits = reg_to_bits(rm) as u16;
2972                    if rn_bits < 8 && rm_bits < 8 {
2973                        let instr: u16 = 0x4280 | (rm_bits << 3) | rn_bits;
2974                        instr.to_le_bytes().to_vec()
2975                    } else {
2976                        let n_bit = (rn_bits >> 3) & 1;
2977                        let instr: u16 = 0x4500 | (n_bit << 7) | (rm_bits << 3) | (rn_bits & 0x7);
2978                        instr.to_le_bytes().to_vec()
2979                    }
2980                };
2981
2982                // Helper: encode ITE <cond> (2 bytes)
2983                let encode_ite = |cond_bits: u16| -> Vec<u8> {
2984                    let mask = if (cond_bits & 1) == 0 { 0xC } else { 0x4 };
2985                    let ite_instr: u16 = 0xBF00 | (cond_bits << 4) | mask;
2986                    ite_instr.to_le_bytes().to_vec()
2987                };
2988
2989                // Helper: encode SetCond (ITE + MOV #1 + MOV #0) for given condition
2990                let encode_setcond = |cond_bits: u16, rd_bits: u16| -> Vec<u8> {
2991                    let mut b = encode_ite(cond_bits);
2992                    if rd_bits < 8 {
2993                        let mov_one: u16 = 0x2001 | (rd_bits << 8);
2994                        let mov_zero: u16 = 0x2000 | (rd_bits << 8);
2995                        b.extend_from_slice(&mov_one.to_le_bytes());
2996                        b.extend_from_slice(&mov_zero.to_le_bytes());
2997                    } else {
2998                        // #311: rd >= R8 — the 16-bit MOV imm8 form has a 3-bit
2999                        // rd field; rd_bits<<8 overflows into bit 11 and
3000                        // TRANSMUTES the MOV into CMP (0x2001|0x0800 = 0x2801 =
3001                        // CMP r0,#1): the boolean dies in the flags and the
3002                        // consumer reads a stale register. Use the 32-bit
3003                        // MOV.W (T2: F04F 0000|rd<<8|imm8) — IT-legal,
3004                        // flag-preserving. Same class as H-CODE-9 / #180.
3005                        for imm in [1u16, 0u16] {
3006                            let hw1: u16 = 0xF04F;
3007                            let hw2: u16 = (rd_bits << 8) | imm;
3008                            b.extend_from_slice(&hw1.to_le_bytes());
3009                            b.extend_from_slice(&hw2.to_le_bytes());
3010                        }
3011                    }
3012                    b
3013                };
3014
3015                match cond {
3016                    Condition::EQ | Condition::NE => {
3017                        // CMP rn_lo, rm_lo (compare low words)
3018                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
3019
3020                        // IT EQ (execute next instruction only if Z=1)
3021                        let it_eq: u16 = 0xBF08; // IT EQ: cond=0000, mask=1000
3022                        bytes.extend_from_slice(&it_eq.to_le_bytes());
3023
3024                        // CMPEQ rn_hi, rm_hi (compare high words, only if low equal)
3025                        bytes.extend_from_slice(&encode_cmp_reg(rn_hi, rm_hi));
3026
3027                        // ITE <cond>; MOV rd, #1; MOV rd, #0
3028                        let cond_bits: u16 = match cond {
3029                            Condition::EQ => 0x0,
3030                            Condition::NE => 0x1,
3031                            _ => unreachable!(),
3032                        };
3033                        bytes.extend_from_slice(&encode_setcond(cond_bits, rd_bits));
3034                    }
3035
3036                    Condition::LT => {
3037                        // CMP rn_lo, rm_lo (sets C flag for borrow)
3038                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
3039
3040                        // SBCS rd, rn_hi, rm_hi (subtract with carry, sets N,V flags)
3041                        // SBCS.W Rd, Rn, Rm: EB70 Rn | 0000 Rd 0000 Rm
3042                        let rn_hi_bits = reg_to_bits(rn_hi);
3043                        let rm_hi_bits = reg_to_bits(rm_hi);
3044                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
3045                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
3046                        bytes.extend_from_slice(&hw1.to_le_bytes());
3047                        bytes.extend_from_slice(&hw2.to_le_bytes());
3048
3049                        // ITE LT; MOV rd, #1; MOV rd, #0
3050                        bytes.extend_from_slice(&encode_setcond(0xB, rd_bits)); // LT = 0xB
3051                    }
3052
3053                    Condition::GT => {
3054                        // GT(a,b) = LT(b,a): swap operands
3055                        // CMP rm_lo, rn_lo (swapped)
3056                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
3057
3058                        // SBCS rd, rm_hi, rn_hi (swapped)
3059                        let rm_hi_bits = reg_to_bits(rm_hi);
3060                        let rn_hi_bits = reg_to_bits(rn_hi);
3061                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
3062                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
3063                        bytes.extend_from_slice(&hw1.to_le_bytes());
3064                        bytes.extend_from_slice(&hw2.to_le_bytes());
3065
3066                        // ITE LT; MOV rd, #1; MOV rd, #0
3067                        bytes.extend_from_slice(&encode_setcond(0xB, rd_bits)); // LT = 0xB
3068                    }
3069
3070                    Condition::LE => {
3071                        // LE(a,b) = !GT(a,b): use GT logic but invert result
3072                        // GT(a,b) = LT(b,a): so we do CMP(b,a) and check LT, then invert
3073                        // CMP rm_lo, rn_lo (swapped, same as GT)
3074                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
3075
3076                        // SBCS rd, rm_hi, rn_hi (swapped)
3077                        let rm_hi_bits = reg_to_bits(rm_hi);
3078                        let rn_hi_bits = reg_to_bits(rn_hi);
3079                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
3080                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
3081                        bytes.extend_from_slice(&hw1.to_le_bytes());
3082                        bytes.extend_from_slice(&hw2.to_le_bytes());
3083
3084                        // ITE GE; MOV rd, #1; MOV rd, #0 (GE is !LT, so inverting GT result)
3085                        bytes.extend_from_slice(&encode_setcond(0xA, rd_bits)); // GE = 0xA
3086                    }
3087
3088                    Condition::GE => {
3089                        // GE(a,b) = !LT(a,b): use LT logic but invert result
3090                        // CMP rn_lo, rm_lo (same as LT)
3091                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
3092
3093                        // SBCS rd, rn_hi, rm_hi (same as LT)
3094                        let rn_hi_bits = reg_to_bits(rn_hi);
3095                        let rm_hi_bits = reg_to_bits(rm_hi);
3096                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
3097                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
3098                        bytes.extend_from_slice(&hw1.to_le_bytes());
3099                        bytes.extend_from_slice(&hw2.to_le_bytes());
3100
3101                        // ITE GE; MOV rd, #1; MOV rd, #0 (GE is !LT)
3102                        bytes.extend_from_slice(&encode_setcond(0xA, rd_bits)); // GE = 0xA
3103                    }
3104
3105                    // Unsigned comparisons - same instruction sequence, different conditions
3106                    Condition::LO => {
3107                        // LO (unsigned LT): CMP lo, SBCS hi, check C=0
3108                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
3109                        let rn_hi_bits = reg_to_bits(rn_hi);
3110                        let rm_hi_bits = reg_to_bits(rm_hi);
3111                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
3112                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
3113                        bytes.extend_from_slice(&hw1.to_le_bytes());
3114                        bytes.extend_from_slice(&hw2.to_le_bytes());
3115                        bytes.extend_from_slice(&encode_setcond(0x3, rd_bits)); // LO = 0x3 (CC)
3116                    }
3117
3118                    Condition::HI => {
3119                        // HI (unsigned GT): swap operands and check LO
3120                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
3121                        let rm_hi_bits = reg_to_bits(rm_hi);
3122                        let rn_hi_bits = reg_to_bits(rn_hi);
3123                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
3124                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
3125                        bytes.extend_from_slice(&hw1.to_le_bytes());
3126                        bytes.extend_from_slice(&hw2.to_le_bytes());
3127                        bytes.extend_from_slice(&encode_setcond(0x3, rd_bits)); // LO = 0x3 (CC)
3128                    }
3129
3130                    Condition::LS => {
3131                        // LS (unsigned LE): !(a > b) = !(HI), so do HI and invert
3132                        bytes.extend_from_slice(&encode_cmp_reg(rm_lo, rn_lo));
3133                        let rm_hi_bits = reg_to_bits(rm_hi);
3134                        let rn_hi_bits = reg_to_bits(rn_hi);
3135                        let hw1: u16 = (0xEB70 | rm_hi_bits) as u16;
3136                        let hw2: u16 = ((rd_bits as u32) << 8 | rn_hi_bits) as u16;
3137                        bytes.extend_from_slice(&hw1.to_le_bytes());
3138                        bytes.extend_from_slice(&hw2.to_le_bytes());
3139                        bytes.extend_from_slice(&encode_setcond(0x2, rd_bits)); // HS = 0x2 (CS) = !LO
3140                    }
3141
3142                    Condition::HS => {
3143                        // HS (unsigned GE): !(a < b) = !(LO)
3144                        bytes.extend_from_slice(&encode_cmp_reg(rn_lo, rm_lo));
3145                        let rn_hi_bits = reg_to_bits(rn_hi);
3146                        let rm_hi_bits = reg_to_bits(rm_hi);
3147                        let hw1: u16 = (0xEB70 | rn_hi_bits) as u16;
3148                        let hw2: u16 = ((rd_bits as u32) << 8 | rm_hi_bits) as u16;
3149                        bytes.extend_from_slice(&hw1.to_le_bytes());
3150                        bytes.extend_from_slice(&hw2.to_le_bytes());
3151                        bytes.extend_from_slice(&encode_setcond(0x2, rd_bits)); // HS = 0x2 (CS) = !LO
3152                    }
3153                }
3154
3155                Ok(bytes)
3156            }
3157
3158            // I64SetCondZ: Test if i64 register pair is zero, result 0/1 in rd
3159            // ORR.W rd, rn_lo, rn_hi; CMP rd, #0; ITE EQ; MOV 1; MOV 0
3160            ArmOp::I64SetCondZ { rd, rn_lo, rn_hi } => {
3161                let rd_bits = reg_to_bits(rd);
3162                let rn_lo_bits = reg_to_bits(rn_lo);
3163                let rn_hi_bits = reg_to_bits(rn_hi);
3164                let mut bytes = Vec::new();
3165
3166                // ORR.W rd, rn_lo, rn_hi: EA40 rn_lo | 0000 rd 0000 rn_hi
3167                let hw1: u16 = (0xEA40 | rn_lo_bits) as u16;
3168                let hw2: u16 = ((rd_bits << 8) | rn_hi_bits) as u16;
3169                bytes.extend_from_slice(&hw1.to_le_bytes());
3170                bytes.extend_from_slice(&hw2.to_le_bytes());
3171
3172                // CMP rd, #0 — 16-bit form only for r0-r7 (3-bit rd field);
3173                // high registers take CMP.W (T2: F1B0|rn 0F00|imm8). This was
3174                // H-CODE-9: rd_bits<<8 overflowing the field compared the
3175                // WRONG register. Same hardening as the #311 SetCond fix.
3176                if rd_bits < 8 {
3177                    let cmp_instr: u16 = 0x2800 | ((rd_bits as u16) << 8);
3178                    bytes.extend_from_slice(&cmp_instr.to_le_bytes());
3179                } else {
3180                    let hw1: u16 = 0xF1B0 | (rd_bits as u16);
3181                    let hw2: u16 = 0x0F00;
3182                    bytes.extend_from_slice(&hw1.to_le_bytes());
3183                    bytes.extend_from_slice(&hw2.to_le_bytes());
3184                }
3185
3186                // ITE EQ; MOV rd, #1; MOV rd, #0 (32-bit MOV.W for rd >= R8,
3187                // #311 — see I64SetCond)
3188                let mask = 0xC_u16; // ITE EQ mask: firstcond[0]=0, mask=0xC
3189                let ite_instr: u16 = 0xBF00 | mask;
3190                bytes.extend_from_slice(&ite_instr.to_le_bytes());
3191                if rd_bits < 8 {
3192                    let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
3193                    let mov_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
3194                    bytes.extend_from_slice(&mov_one.to_le_bytes());
3195                    bytes.extend_from_slice(&mov_zero.to_le_bytes());
3196                } else {
3197                    for imm in [1u16, 0u16] {
3198                        let hw1: u16 = 0xF04F;
3199                        let hw2: u16 = ((rd_bits as u16) << 8) | imm;
3200                        bytes.extend_from_slice(&hw1.to_le_bytes());
3201                        bytes.extend_from_slice(&hw2.to_le_bytes());
3202                    }
3203                }
3204
3205                Ok(bytes)
3206            }
3207
3208            // I64Mul: 64-bit multiply using UMULL + MLA cross products
3209            // Formula: result = (a_lo * b_lo) + ((a_lo * b_hi + a_hi * b_lo) << 32)
3210            // Uses R12 as scratch register
3211            ArmOp::I64Mul {
3212                rd_lo,
3213                rd_hi,
3214                rn_lo,
3215                rn_hi,
3216                rm_lo,
3217                rm_hi,
3218            } => {
3219                let rd_lo_bits = reg_to_bits(rd_lo);
3220                let rd_hi_bits = reg_to_bits(rd_hi);
3221                let rn_lo_bits = reg_to_bits(rn_lo);
3222                let rn_hi_bits = reg_to_bits(rn_hi);
3223                let rm_lo_bits = reg_to_bits(rm_lo);
3224                let rm_hi_bits = reg_to_bits(rm_hi);
3225                let r12: u32 = 12; // IP scratch register
3226                let mut bytes = Vec::new();
3227
3228                // 1. MUL R12, rn_lo, rm_hi  (R12 = a_lo * b_hi)
3229                // Thumb-2 MUL: hw1=0xFB00|Rn, hw2=0xF000|(Rd<<8)|Rm
3230                let hw1: u16 = (0xFB00 | rn_lo_bits) as u16;
3231                let hw2: u16 = (0xF000 | (r12 << 8) | rm_hi_bits) as u16;
3232                bytes.extend_from_slice(&hw1.to_le_bytes());
3233                bytes.extend_from_slice(&hw2.to_le_bytes());
3234
3235                // 2. MLA R12, rn_hi, rm_lo, R12  (R12 += a_hi * b_lo)
3236                // Thumb-2 MLA: hw1=0xFB00|Rn, hw2=(Ra<<12)|(Rd<<8)|Rm
3237                let hw1: u16 = (0xFB00 | rn_hi_bits) as u16;
3238                let hw2: u16 = ((r12 << 12) | (r12 << 8) | rm_lo_bits) as u16;
3239                bytes.extend_from_slice(&hw1.to_le_bytes());
3240                bytes.extend_from_slice(&hw2.to_le_bytes());
3241
3242                // 3. UMULL rd_lo, rd_hi, rn_lo, rm_lo  (rd_lo:rd_hi = a_lo * b_lo)
3243                // Thumb-2 UMULL: hw1=0xFBA0|Rn, hw2=(RdLo<<12)|(RdHi<<8)|Rm
3244                let hw1: u16 = (0xFBA0 | rn_lo_bits) as u16;
3245                let hw2: u16 = ((rd_lo_bits << 12) | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3246                bytes.extend_from_slice(&hw1.to_le_bytes());
3247                bytes.extend_from_slice(&hw2.to_le_bytes());
3248
3249                // 4. ADD rd_hi, R12  (rd_hi += cross products)
3250                // 16-bit high reg ADD: 01000100 D Rm Rdn[2:0]
3251                let d_bit = (rd_hi_bits >> 3) & 1;
3252                let add_instr: u16 =
3253                    (0x4400 | (d_bit << 7) | (r12 << 3) | (rd_hi_bits & 0x7)) as u16;
3254                bytes.extend_from_slice(&add_instr.to_le_bytes());
3255
3256                Ok(bytes)
3257            }
3258
3259            // I64Shl: 64-bit shift left with branch for n<32 vs n>=32
3260            // rm_hi (R3) is used as temp register
3261            ArmOp::I64Shl {
3262                rd_lo,
3263                rd_hi,
3264                rn_lo,
3265                rn_hi,
3266                rm_lo,
3267                rm_hi,
3268            } => {
3269                let rd_lo_bits = reg_to_bits(rd_lo);
3270                let rd_hi_bits = reg_to_bits(rd_hi);
3271                let rn_lo_bits = reg_to_bits(rn_lo);
3272                let rn_hi_bits = reg_to_bits(rn_hi);
3273                let rm_lo_bits = reg_to_bits(rm_lo);
3274                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3275                let mut bytes = Vec::new();
3276
3277                // AND.W rm_lo, rm_lo, #63  (mask shift amount to 6 bits)
3278                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3279                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3280                bytes.extend_from_slice(&hw1.to_le_bytes());
3281                bytes.extend_from_slice(&hw2.to_le_bytes());
3282
3283                // SUBS.W rm_hi, rm_lo, #32  (rm_hi = n-32, sets flags)
3284                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3285                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3286                bytes.extend_from_slice(&hw1.to_le_bytes());
3287                bytes.extend_from_slice(&hw2.to_le_bytes());
3288
3289                // BPL .large (branch if n >= 32, offset = +10 halfwords)
3290                let bpl: u16 = 0xD50A;
3291                bytes.extend_from_slice(&bpl.to_le_bytes());
3292
3293                // --- Small shift (n < 32) ---
3294                // RSB.W rm_hi, rm_lo, #32  (rm_hi = 32-n)
3295                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3296                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3297                bytes.extend_from_slice(&hw1.to_le_bytes());
3298                bytes.extend_from_slice(&hw2.to_le_bytes());
3299
3300                // LSR.W rm_hi, rn_lo, rm_hi  (rm_hi = lo >> (32-n), overflow bits)
3301                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3302                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3303                bytes.extend_from_slice(&hw1.to_le_bytes());
3304                bytes.extend_from_slice(&hw2.to_le_bytes());
3305
3306                // LSL.W rd_hi, rn_hi, rm_lo  (hi <<= n)
3307                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3308                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3309                bytes.extend_from_slice(&hw1.to_le_bytes());
3310                bytes.extend_from_slice(&hw2.to_le_bytes());
3311
3312                // ORR.W rd_hi, rd_hi, rm_hi  (hi |= overflow bits from lo)
3313                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3314                let hw2: u16 = ((rd_hi_bits << 8) | rm_hi_bits) as u16;
3315                bytes.extend_from_slice(&hw1.to_le_bytes());
3316                bytes.extend_from_slice(&hw2.to_le_bytes());
3317
3318                // LSL.W rd_lo, rn_lo, rm_lo  (lo <<= n)
3319                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3320                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3321                bytes.extend_from_slice(&hw1.to_le_bytes());
3322                bytes.extend_from_slice(&hw2.to_le_bytes());
3323
3324                // B .done (skip large shift: +2 halfwords)
3325                let b_done: u16 = 0xE002;
3326                bytes.extend_from_slice(&b_done.to_le_bytes());
3327
3328                // --- Large shift (n >= 32) ---
3329                // LSL.W rd_hi, rn_lo, rm_hi  (hi = lo << (n-32))
3330                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3331                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_hi_bits) as u16;
3332                bytes.extend_from_slice(&hw1.to_le_bytes());
3333                bytes.extend_from_slice(&hw2.to_le_bytes());
3334
3335                // MOV rd_lo, #0
3336                let mov_zero: u16 = 0x2000 | ((rd_lo_bits as u16) << 8);
3337                bytes.extend_from_slice(&mov_zero.to_le_bytes());
3338
3339                Ok(bytes) // Total: 38 bytes
3340            }
3341
3342            // I64ShrU: 64-bit logical shift right with branch for n<32 vs n>=32
3343            ArmOp::I64ShrU {
3344                rd_lo,
3345                rd_hi,
3346                rn_lo,
3347                rn_hi,
3348                rm_lo,
3349                rm_hi,
3350            } => {
3351                let rd_lo_bits = reg_to_bits(rd_lo);
3352                let rd_hi_bits = reg_to_bits(rd_hi);
3353                let rn_lo_bits = reg_to_bits(rn_lo);
3354                let rn_hi_bits = reg_to_bits(rn_hi);
3355                let rm_lo_bits = reg_to_bits(rm_lo);
3356                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3357                let mut bytes = Vec::new();
3358
3359                // AND.W rm_lo, rm_lo, #63
3360                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3361                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3362                bytes.extend_from_slice(&hw1.to_le_bytes());
3363                bytes.extend_from_slice(&hw2.to_le_bytes());
3364
3365                // SUBS.W rm_hi, rm_lo, #32
3366                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3367                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3368                bytes.extend_from_slice(&hw1.to_le_bytes());
3369                bytes.extend_from_slice(&hw2.to_le_bytes());
3370
3371                // BPL .large (+10 halfwords)
3372                let bpl: u16 = 0xD50A;
3373                bytes.extend_from_slice(&bpl.to_le_bytes());
3374
3375                // --- Small shift (n < 32) ---
3376                // RSB.W rm_hi, rm_lo, #32  (rm_hi = 32-n)
3377                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3378                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3379                bytes.extend_from_slice(&hw1.to_le_bytes());
3380                bytes.extend_from_slice(&hw2.to_le_bytes());
3381
3382                // LSL.W rm_hi, rn_hi, rm_hi  (rm_hi = hi << (32-n), bits flowing to lo)
3383                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3384                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3385                bytes.extend_from_slice(&hw1.to_le_bytes());
3386                bytes.extend_from_slice(&hw2.to_le_bytes());
3387
3388                // LSR.W rd_lo, rn_lo, rm_lo  (lo >>= n)
3389                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3390                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3391                bytes.extend_from_slice(&hw1.to_le_bytes());
3392                bytes.extend_from_slice(&hw2.to_le_bytes());
3393
3394                // ORR.W rd_lo, rd_lo, rm_hi  (lo |= overflow from hi)
3395                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3396                let hw2: u16 = ((rd_lo_bits << 8) | rm_hi_bits) as u16;
3397                bytes.extend_from_slice(&hw1.to_le_bytes());
3398                bytes.extend_from_slice(&hw2.to_le_bytes());
3399
3400                // LSR.W rd_hi, rn_hi, rm_lo  (hi >>= n, logical)
3401                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3402                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3403                bytes.extend_from_slice(&hw1.to_le_bytes());
3404                bytes.extend_from_slice(&hw2.to_le_bytes());
3405
3406                // B .done (+2 halfwords)
3407                let b_done: u16 = 0xE002;
3408                bytes.extend_from_slice(&b_done.to_le_bytes());
3409
3410                // --- Large shift (n >= 32) ---
3411                // LSR.W rd_lo, rn_hi, rm_hi  (lo = hi >> (n-32))
3412                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3413                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_hi_bits) as u16;
3414                bytes.extend_from_slice(&hw1.to_le_bytes());
3415                bytes.extend_from_slice(&hw2.to_le_bytes());
3416
3417                // MOV rd_hi, #0
3418                let mov_zero: u16 = 0x2000 | ((rd_hi_bits as u16) << 8);
3419                bytes.extend_from_slice(&mov_zero.to_le_bytes());
3420
3421                Ok(bytes) // Total: 38 bytes
3422            }
3423
3424            // I64ShrS: 64-bit arithmetic shift right with branch for n<32 vs n>=32
3425            ArmOp::I64ShrS {
3426                rd_lo,
3427                rd_hi,
3428                rn_lo,
3429                rn_hi,
3430                rm_lo,
3431                rm_hi,
3432            } => {
3433                let rd_lo_bits = reg_to_bits(rd_lo);
3434                let rd_hi_bits = reg_to_bits(rd_hi);
3435                let rn_lo_bits = reg_to_bits(rn_lo);
3436                let rn_hi_bits = reg_to_bits(rn_hi);
3437                let rm_lo_bits = reg_to_bits(rm_lo);
3438                let rm_hi_bits = reg_to_bits(rm_hi); // temp
3439                let mut bytes = Vec::new();
3440
3441                // AND.W rm_lo, rm_lo, #63
3442                let hw1: u16 = (0xF000 | rm_lo_bits) as u16;
3443                let hw2: u16 = ((rm_lo_bits << 8) | 0x3F) as u16;
3444                bytes.extend_from_slice(&hw1.to_le_bytes());
3445                bytes.extend_from_slice(&hw2.to_le_bytes());
3446
3447                // SUBS.W rm_hi, rm_lo, #32
3448                let hw1: u16 = (0xF1B0 | rm_lo_bits) as u16;
3449                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3450                bytes.extend_from_slice(&hw1.to_le_bytes());
3451                bytes.extend_from_slice(&hw2.to_le_bytes());
3452
3453                // BPL .large (+10 halfwords)
3454                let bpl: u16 = 0xD50A;
3455                bytes.extend_from_slice(&bpl.to_le_bytes());
3456
3457                // --- Small shift (n < 32) ---
3458                // RSB.W rm_hi, rm_lo, #32
3459                let hw1: u16 = (0xF1C0 | rm_lo_bits) as u16;
3460                let hw2: u16 = ((rm_hi_bits << 8) | 0x20) as u16;
3461                bytes.extend_from_slice(&hw1.to_le_bytes());
3462                bytes.extend_from_slice(&hw2.to_le_bytes());
3463
3464                // LSL.W rm_hi, rn_hi, rm_hi  (rm_hi = hi << (32-n), bits flowing to lo)
3465                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3466                let hw2: u16 = (0xF000 | (rm_hi_bits << 8) | rm_hi_bits) as u16;
3467                bytes.extend_from_slice(&hw1.to_le_bytes());
3468                bytes.extend_from_slice(&hw2.to_le_bytes());
3469
3470                // LSR.W rd_lo, rn_lo, rm_lo  (lo >>= n, logical for lo word)
3471                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3472                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_lo_bits) as u16;
3473                bytes.extend_from_slice(&hw1.to_le_bytes());
3474                bytes.extend_from_slice(&hw2.to_le_bytes());
3475
3476                // ORR.W rd_lo, rd_lo, rm_hi  (lo |= overflow from hi)
3477                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3478                let hw2: u16 = ((rd_lo_bits << 8) | rm_hi_bits) as u16;
3479                bytes.extend_from_slice(&hw1.to_le_bytes());
3480                bytes.extend_from_slice(&hw2.to_le_bytes());
3481
3482                // ASR.W rd_hi, rn_hi, rm_lo  (hi >>= n, arithmetic/sign-extending)
3483                let hw1: u16 = (0xFA40 | rn_hi_bits) as u16;
3484                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | rm_lo_bits) as u16;
3485                bytes.extend_from_slice(&hw1.to_le_bytes());
3486                bytes.extend_from_slice(&hw2.to_le_bytes());
3487
3488                // B .done (+3 halfwords, large shift is 8 bytes)
3489                let b_done: u16 = 0xE003;
3490                bytes.extend_from_slice(&b_done.to_le_bytes());
3491
3492                // --- Large shift (n >= 32) ---
3493                // ASR.W rd_lo, rn_hi, rm_hi  (lo = hi >>> (n-32))
3494                let hw1: u16 = (0xFA40 | rn_hi_bits) as u16;
3495                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | rm_hi_bits) as u16;
3496                bytes.extend_from_slice(&hw1.to_le_bytes());
3497                bytes.extend_from_slice(&hw2.to_le_bytes());
3498
3499                // ASR.W rd_hi, rn_hi, #31  (hi = sign extension, all 0s or all 1s)
3500                // Thumb-2 ASR immediate: hw1=0xEA4F, hw2=imm3:Rd:imm2:10:Rm
3501                // imm5=31=11111 → imm3=111, imm2=11
3502                let hw1: u16 = 0xEA4F;
3503                let hw2: u16 = (0x7000 | (rd_hi_bits << 8) | 0x00E0 | rn_hi_bits) as u16;
3504                bytes.extend_from_slice(&hw1.to_le_bytes());
3505                bytes.extend_from_slice(&hw2.to_le_bytes());
3506
3507                Ok(bytes) // Total: 40 bytes
3508            }
3509
3510            // I64Rotl: 64-bit rotate left
3511            // For n < 32: new_hi = (hi << n) | (lo >> (32-n)), new_lo = (lo << n) | (hi >> (32-n))
3512            // For n >= 32: same formula but with lo/hi conceptually swapped, shift by (n-32)
3513            // Uses R4 (saved/restored) and R12 as scratch
3514            ArmOp::I64Rotl {
3515                rdlo,
3516                rdhi,
3517                rnlo,
3518                rnhi,
3519                shift,
3520            } => {
3521                let rd_lo_bits = reg_to_bits(rdlo);
3522                let rd_hi_bits = reg_to_bits(rdhi);
3523                let rn_lo_bits = reg_to_bits(rnlo);
3524                let rn_hi_bits = reg_to_bits(rnhi);
3525                let shift_bits = reg_to_bits(shift);
3526                let r12: u32 = 12; // IP scratch
3527                let r3: u32 = 3; // Scratch (high word of shift amount, unused)
3528                let r4: u32 = 4; // Scratch (saved/restored)
3529                let mut bytes = Vec::new();
3530
3531                // PUSH {R4}
3532                bytes.extend_from_slice(&0xB410u16.to_le_bytes());
3533
3534                // AND.W shift, shift, #63 (mask to 6 bits)
3535                let hw1: u16 = (0xF000 | shift_bits) as u16;
3536                let hw2: u16 = ((shift_bits << 8) | 0x3F) as u16;
3537                bytes.extend_from_slice(&hw1.to_le_bytes());
3538                bytes.extend_from_slice(&hw2.to_le_bytes());
3539
3540                // SUBS.W R3, shift, #32 (R3 = n-32, sets flags)
3541                let hw1: u16 = (0xF1B0 | shift_bits) as u16;
3542                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3543                bytes.extend_from_slice(&hw1.to_le_bytes());
3544                bytes.extend_from_slice(&hw2.to_le_bytes());
3545
3546                // BPL .large (branch if n >= 32, offset = +14 halfwords)
3547                let bpl: u16 = 0xD50E;
3548                bytes.extend_from_slice(&bpl.to_le_bytes());
3549
3550                // === Small rotation (n < 32) ===
3551                // RSB.W R3, shift, #32 (R3 = 32-n)
3552                let hw1: u16 = (0xF1C0 | shift_bits) as u16;
3553                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3554                bytes.extend_from_slice(&hw1.to_le_bytes());
3555                bytes.extend_from_slice(&hw2.to_le_bytes());
3556
3557                // LSR.W R4, rn_lo, R3 (R4 = lo >> (32-n), will go to new_hi)
3558                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3559                let hw2: u16 = (0xF000 | (r4 << 8) | r3) as u16;
3560                bytes.extend_from_slice(&hw1.to_le_bytes());
3561                bytes.extend_from_slice(&hw2.to_le_bytes());
3562
3563                // LSR.W R12, rn_hi, R3 (R12 = hi >> (32-n), will go to new_lo)
3564                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3565                let hw2: u16 = (0xF000 | (r12 << 8) | r3) as u16;
3566                bytes.extend_from_slice(&hw1.to_le_bytes());
3567                bytes.extend_from_slice(&hw2.to_le_bytes());
3568
3569                // LSL.W rd_hi, rn_hi, shift (rd_hi = hi << n)
3570                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3571                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | shift_bits) as u16;
3572                bytes.extend_from_slice(&hw1.to_le_bytes());
3573                bytes.extend_from_slice(&hw2.to_le_bytes());
3574
3575                // ORR.W rd_hi, rd_hi, R4 (rd_hi = (hi << n) | (lo >> (32-n)))
3576                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3577                let hw2: u16 = ((rd_hi_bits << 8) | r4) as u16;
3578                bytes.extend_from_slice(&hw1.to_le_bytes());
3579                bytes.extend_from_slice(&hw2.to_le_bytes());
3580
3581                // LSL.W rd_lo, rn_lo, shift (rd_lo = lo << n)
3582                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3583                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | shift_bits) as u16;
3584                bytes.extend_from_slice(&hw1.to_le_bytes());
3585                bytes.extend_from_slice(&hw2.to_le_bytes());
3586
3587                // ORR.W rd_lo, rd_lo, R12 (rd_lo = (lo << n) | (hi >> (32-n)))
3588                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3589                let hw2: u16 = ((rd_lo_bits << 8) | r12) as u16;
3590                bytes.extend_from_slice(&hw1.to_le_bytes());
3591                bytes.extend_from_slice(&hw2.to_le_bytes());
3592
3593                // B .done (skip large block, offset = +14 halfwords)
3594                let b_done: u16 = 0xE00E;
3595                bytes.extend_from_slice(&b_done.to_le_bytes());
3596
3597                // === Large rotation (n >= 32) ===
3598                // R3 already has n-32 from the SUBS
3599                // RSB.W R4, R3, #32 (R4 = 32-(n-32) = 64-n)
3600                let hw1: u16 = (0xF1C0 | r3) as u16;
3601                let hw2: u16 = ((r4 << 8) | 0x20) as u16;
3602                bytes.extend_from_slice(&hw1.to_le_bytes());
3603                bytes.extend_from_slice(&hw2.to_le_bytes());
3604
3605                // LSR.W R12, rn_hi, R4 (R12 = hi >> (64-n), goes to new_hi low bits)
3606                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3607                let hw2: u16 = (0xF000 | (r12 << 8) | r4) as u16;
3608                bytes.extend_from_slice(&hw1.to_le_bytes());
3609                bytes.extend_from_slice(&hw2.to_le_bytes());
3610
3611                // LSR.W R4, rn_lo, R4 (R4 = lo >> (64-n), goes to new_lo low bits)
3612                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3613                let hw2: u16 = (0xF000 | (r4 << 8) | r4) as u16;
3614                bytes.extend_from_slice(&hw1.to_le_bytes());
3615                bytes.extend_from_slice(&hw2.to_le_bytes());
3616
3617                // LSL.W shift, rn_lo, R3 (shift = lo << (n-32), new_hi high bits)
3618                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3619                let hw2: u16 = (0xF000 | (shift_bits << 8) | r3) as u16;
3620                bytes.extend_from_slice(&hw1.to_le_bytes());
3621                bytes.extend_from_slice(&hw2.to_le_bytes());
3622
3623                // ORR.W shift, shift, R12 (shift = (lo << (n-32)) | (hi >> (64-n)) = new_hi)
3624                let hw1: u16 = (0xEA40 | shift_bits) as u16;
3625                let hw2: u16 = ((shift_bits << 8) | r12) as u16;
3626                bytes.extend_from_slice(&hw1.to_le_bytes());
3627                bytes.extend_from_slice(&hw2.to_le_bytes());
3628
3629                // LSL.W rd_lo, rn_hi, R3 (rd_lo = hi << (n-32), new_lo high bits)
3630                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3631                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | r3) as u16;
3632                bytes.extend_from_slice(&hw1.to_le_bytes());
3633                bytes.extend_from_slice(&hw2.to_le_bytes());
3634
3635                // ORR.W rd_lo, rd_lo, R4 (rd_lo = (hi << (n-32)) | (lo >> (64-n)) = new_lo)
3636                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3637                let hw2: u16 = ((rd_lo_bits << 8) | r4) as u16;
3638                bytes.extend_from_slice(&hw1.to_le_bytes());
3639                bytes.extend_from_slice(&hw2.to_le_bytes());
3640
3641                // MOV rd_hi, shift (rd_hi = new_hi)
3642                let d_bit = (rd_hi_bits >> 3) & 1;
3643                let mov_instr: u16 =
3644                    (0x4600 | (d_bit << 7) | (shift_bits << 3) | (rd_hi_bits & 0x7)) as u16;
3645                bytes.extend_from_slice(&mov_instr.to_le_bytes());
3646
3647                // POP {R4}
3648                bytes.extend_from_slice(&0xBC10u16.to_le_bytes());
3649
3650                Ok(bytes) // Total: 74 bytes
3651            }
3652
3653            // I64Rotr: 64-bit rotate right
3654            // rotr(x, n) = rotl(x, 64-n)
3655            // For n < 32: new_lo = (lo >> n) | (hi << (32-n)), new_hi = (hi >> n) | (lo << (32-n))
3656            // For n >= 32: same formula but with lo/hi swapped, shift by (n-32)
3657            ArmOp::I64Rotr {
3658                rdlo,
3659                rdhi,
3660                rnlo,
3661                rnhi,
3662                shift,
3663            } => {
3664                let rd_lo_bits = reg_to_bits(rdlo);
3665                let rd_hi_bits = reg_to_bits(rdhi);
3666                let rn_lo_bits = reg_to_bits(rnlo);
3667                let rn_hi_bits = reg_to_bits(rnhi);
3668                let shift_bits = reg_to_bits(shift);
3669                let r12: u32 = 12;
3670                let r3: u32 = 3;
3671                let r4: u32 = 4;
3672                let mut bytes = Vec::new();
3673
3674                // PUSH {R4}
3675                bytes.extend_from_slice(&0xB410u16.to_le_bytes());
3676
3677                // AND.W shift, shift, #63
3678                let hw1: u16 = (0xF000 | shift_bits) as u16;
3679                let hw2: u16 = ((shift_bits << 8) | 0x3F) as u16;
3680                bytes.extend_from_slice(&hw1.to_le_bytes());
3681                bytes.extend_from_slice(&hw2.to_le_bytes());
3682
3683                // SUBS.W R3, shift, #32
3684                let hw1: u16 = (0xF1B0 | shift_bits) as u16;
3685                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3686                bytes.extend_from_slice(&hw1.to_le_bytes());
3687                bytes.extend_from_slice(&hw2.to_le_bytes());
3688
3689                // BPL .large (+14 halfwords)
3690                let bpl: u16 = 0xD50E;
3691                bytes.extend_from_slice(&bpl.to_le_bytes());
3692
3693                // === Small rotation (n < 32) ===
3694                // RSB.W R3, shift, #32 (R3 = 32-n)
3695                let hw1: u16 = (0xF1C0 | shift_bits) as u16;
3696                let hw2: u16 = ((r3 << 8) | 0x20) as u16;
3697                bytes.extend_from_slice(&hw1.to_le_bytes());
3698                bytes.extend_from_slice(&hw2.to_le_bytes());
3699
3700                // LSL.W R4, rn_hi, R3 (R4 = hi << (32-n), will go to new_lo)
3701                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3702                let hw2: u16 = (0xF000 | (r4 << 8) | r3) as u16;
3703                bytes.extend_from_slice(&hw1.to_le_bytes());
3704                bytes.extend_from_slice(&hw2.to_le_bytes());
3705
3706                // LSL.W R12, rn_lo, R3 (R12 = lo << (32-n), will go to new_hi)
3707                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3708                let hw2: u16 = (0xF000 | (r12 << 8) | r3) as u16;
3709                bytes.extend_from_slice(&hw1.to_le_bytes());
3710                bytes.extend_from_slice(&hw2.to_le_bytes());
3711
3712                // LSR.W rd_lo, rn_lo, shift (rd_lo = lo >> n)
3713                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3714                let hw2: u16 = (0xF000 | (rd_lo_bits << 8) | shift_bits) as u16;
3715                bytes.extend_from_slice(&hw1.to_le_bytes());
3716                bytes.extend_from_slice(&hw2.to_le_bytes());
3717
3718                // ORR.W rd_lo, rd_lo, R4 (rd_lo = (lo >> n) | (hi << (32-n)))
3719                let hw1: u16 = (0xEA40 | rd_lo_bits) as u16;
3720                let hw2: u16 = ((rd_lo_bits << 8) | r4) as u16;
3721                bytes.extend_from_slice(&hw1.to_le_bytes());
3722                bytes.extend_from_slice(&hw2.to_le_bytes());
3723
3724                // LSR.W rd_hi, rn_hi, shift (rd_hi = hi >> n)
3725                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3726                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | shift_bits) as u16;
3727                bytes.extend_from_slice(&hw1.to_le_bytes());
3728                bytes.extend_from_slice(&hw2.to_le_bytes());
3729
3730                // ORR.W rd_hi, rd_hi, R12 (rd_hi = (hi >> n) | (lo << (32-n)))
3731                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3732                let hw2: u16 = ((rd_hi_bits << 8) | r12) as u16;
3733                bytes.extend_from_slice(&hw1.to_le_bytes());
3734                bytes.extend_from_slice(&hw2.to_le_bytes());
3735
3736                // B .done (+14 halfwords)
3737                let b_done: u16 = 0xE00E;
3738                bytes.extend_from_slice(&b_done.to_le_bytes());
3739
3740                // === Large rotation (n >= 32) ===
3741                // RSB.W R4, R3, #32 (R4 = 64-n)
3742                let hw1: u16 = (0xF1C0 | r3) as u16;
3743                let hw2: u16 = ((r4 << 8) | 0x20) as u16;
3744                bytes.extend_from_slice(&hw1.to_le_bytes());
3745                bytes.extend_from_slice(&hw2.to_le_bytes());
3746
3747                // LSL.W R12, rn_lo, R4 (R12 = lo << (64-n), goes to new_lo low bits)
3748                let hw1: u16 = (0xFA00 | rn_lo_bits) as u16;
3749                let hw2: u16 = (0xF000 | (r12 << 8) | r4) as u16;
3750                bytes.extend_from_slice(&hw1.to_le_bytes());
3751                bytes.extend_from_slice(&hw2.to_le_bytes());
3752
3753                // LSL.W R4, rn_hi, R4 (R4 = hi << (64-n), goes to new_hi low bits)
3754                let hw1: u16 = (0xFA00 | rn_hi_bits) as u16;
3755                let hw2: u16 = (0xF000 | (r4 << 8) | r4) as u16;
3756                bytes.extend_from_slice(&hw1.to_le_bytes());
3757                bytes.extend_from_slice(&hw2.to_le_bytes());
3758
3759                // LSR.W shift, rn_hi, R3 (shift = hi >> (n-32), new_lo high bits)
3760                let hw1: u16 = (0xFA20 | rn_hi_bits) as u16;
3761                let hw2: u16 = (0xF000 | (shift_bits << 8) | r3) as u16;
3762                bytes.extend_from_slice(&hw1.to_le_bytes());
3763                bytes.extend_from_slice(&hw2.to_le_bytes());
3764
3765                // ORR.W shift, shift, R12 (shift = (hi >> (n-32)) | (lo << (64-n)) = new_lo)
3766                let hw1: u16 = (0xEA40 | shift_bits) as u16;
3767                let hw2: u16 = ((shift_bits << 8) | r12) as u16;
3768                bytes.extend_from_slice(&hw1.to_le_bytes());
3769                bytes.extend_from_slice(&hw2.to_le_bytes());
3770
3771                // LSR.W rd_hi, rn_lo, R3 (rd_hi = lo >> (n-32), new_hi high bits)
3772                let hw1: u16 = (0xFA20 | rn_lo_bits) as u16;
3773                let hw2: u16 = (0xF000 | (rd_hi_bits << 8) | r3) as u16;
3774                bytes.extend_from_slice(&hw1.to_le_bytes());
3775                bytes.extend_from_slice(&hw2.to_le_bytes());
3776
3777                // ORR.W rd_hi, rd_hi, R4 (rd_hi = (lo >> (n-32)) | (hi << (64-n)) = new_hi)
3778                let hw1: u16 = (0xEA40 | rd_hi_bits) as u16;
3779                let hw2: u16 = ((rd_hi_bits << 8) | r4) as u16;
3780                bytes.extend_from_slice(&hw1.to_le_bytes());
3781                bytes.extend_from_slice(&hw2.to_le_bytes());
3782
3783                // MOV rd_lo, shift (rd_lo = new_lo)
3784                let d_bit = (rd_lo_bits >> 3) & 1;
3785                let mov_instr: u16 =
3786                    (0x4600 | (d_bit << 7) | (shift_bits << 3) | (rd_lo_bits & 0x7)) as u16;
3787                bytes.extend_from_slice(&mov_instr.to_le_bytes());
3788
3789                // POP {R4}
3790                bytes.extend_from_slice(&0xBC10u16.to_le_bytes());
3791
3792                Ok(bytes) // Total: 74 bytes
3793            }
3794
3795            // I64Clz: Count leading zeros in 64-bit value
3796            // If hi != 0: result = CLZ(hi)
3797            // If hi == 0: result = 32 + CLZ(lo)
3798            //
3799            // Layout (using CMP+BNE approach for consistency):
3800            // 0: CMP.W rnhi, #0 (4 bytes)
3801            // 4: BEQ .hi_zero (2 bytes) - branch forward to offset 14
3802            // 6: CLZ.W rd, rnhi (4 bytes)
3803            // 10: B .done (2 bytes) - branch forward to offset 22
3804            // 12: NOP (2 bytes) - padding for alignment
3805            // 14: .hi_zero: CLZ.W rd, rnlo (4 bytes)
3806            // 18: ADD.W rd, rd, #32 (4 bytes)
3807            // 22: .done
3808            ArmOp::I64Clz { rd, rnlo, rnhi } => {
3809                let rd_bits = reg_to_bits(rd);
3810                let rn_lo_bits = reg_to_bits(rnlo);
3811                let rn_hi_bits = reg_to_bits(rnhi);
3812                let mut bytes = Vec::new();
3813
3814                // CMP.W rnhi, #0 (4 bytes at offset 0)
3815                let hw1: u16 = (0xF1B0 | rn_hi_bits) as u16;
3816                let hw2: u16 = 0x0F00;
3817                bytes.extend_from_slice(&hw1.to_le_bytes());
3818                bytes.extend_from_slice(&hw2.to_le_bytes());
3819
3820                // BEQ .hi_zero (2 bytes at offset 4)
3821                // PC = 4 + 4 = 8, target = 14, offset = 6, imm8 = 3
3822                let beq: u16 = 0xD003;
3823                bytes.extend_from_slice(&beq.to_le_bytes());
3824
3825                // CLZ.W rd, rnhi (4 bytes at offset 6)
3826                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3827                let hw1: u16 = (0xFAB0 | rn_hi_bits) as u16;
3828                let hw2: u16 = (0xF080 | (rd_bits << 8) | rn_hi_bits) as u16;
3829                bytes.extend_from_slice(&hw1.to_le_bytes());
3830                bytes.extend_from_slice(&hw2.to_le_bytes());
3831
3832                // B .done (2 bytes at offset 10)
3833                // PC = 10 + 4 = 14, target = 22, offset = 8, imm11 = 4
3834                let b_done: u16 = 0xE004;
3835                bytes.extend_from_slice(&b_done.to_le_bytes());
3836
3837                // NOP (2 bytes at offset 12) - padding
3838                bytes.extend_from_slice(&0xBF00u16.to_le_bytes());
3839
3840                // .hi_zero: (offset 14)
3841                // CLZ.W rd, rnlo (4 bytes)
3842                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3843                let hw1: u16 = (0xFAB0 | rn_lo_bits) as u16;
3844                let hw2: u16 = (0xF080 | (rd_bits << 8) | rn_lo_bits) as u16;
3845                bytes.extend_from_slice(&hw1.to_le_bytes());
3846                bytes.extend_from_slice(&hw2.to_le_bytes());
3847
3848                // ADD.W rd, rd, #32 (4 bytes at offset 18)
3849                let hw1: u16 = (0xF100 | rd_bits) as u16;
3850                let hw2: u16 = ((rd_bits << 8) | 0x20) as u16;
3851                bytes.extend_from_slice(&hw1.to_le_bytes());
3852                bytes.extend_from_slice(&hw2.to_le_bytes());
3853
3854                // .done: (offset 22)
3855                // i64.clz returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
3856                // MOVS Rn, #0: 0010 0 Rn 00000000
3857                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3858                bytes.extend_from_slice(&mov0.to_le_bytes());
3859
3860                Ok(bytes)
3861            }
3862
3863            // I64Ctz: Count trailing zeros in 64-bit value
3864            // If lo != 0: result = CTZ(lo) = CLZ(RBIT(lo))
3865            // If lo == 0: result = 32 + CTZ(hi) = 32 + CLZ(RBIT(hi))
3866            //
3867            // Layout:
3868            // 0: CMP.W rnlo, #0 (4 bytes)
3869            // 4: BEQ .lo_zero (2 bytes) - branch to offset 18
3870            // 6: RBIT.W rd, rnlo (4 bytes)
3871            // 10: CLZ.W rd, rd (4 bytes)
3872            // 14: B .done (2 bytes) - branch to offset 30
3873            // 16: NOP (2 bytes) - padding
3874            // 18: .lo_zero: RBIT.W rd, rnhi (4 bytes)
3875            // 22: CLZ.W rd, rd (4 bytes)
3876            // 26: ADD.W rd, rd, #32 (4 bytes)
3877            // 30: .done
3878            ArmOp::I64Ctz { rd, rnlo, rnhi } => {
3879                let rd_bits = reg_to_bits(rd);
3880                let rn_lo_bits = reg_to_bits(rnlo);
3881                let rn_hi_bits = reg_to_bits(rnhi);
3882                let mut bytes = Vec::new();
3883
3884                // CMP.W rnlo, #0 (4 bytes at offset 0)
3885                let hw1: u16 = (0xF1B0 | rn_lo_bits) as u16;
3886                let hw2: u16 = 0x0F00;
3887                bytes.extend_from_slice(&hw1.to_le_bytes());
3888                bytes.extend_from_slice(&hw2.to_le_bytes());
3889
3890                // BEQ .lo_zero (2 bytes at offset 4)
3891                // PC = 4 + 4 = 8, target = 18, offset = 10, imm8 = 5
3892                let beq: u16 = 0xD005;
3893                bytes.extend_from_slice(&beq.to_le_bytes());
3894
3895                // RBIT.W rd, rnlo (4 bytes at offset 6)
3896                // RBIT T1: hw1 = 0xFA9<Rm>, hw2 = 0xF<Rd>A<Rm>
3897                let hw1: u16 = (0xFA90 | rn_lo_bits) as u16;
3898                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rn_lo_bits) as u16;
3899                bytes.extend_from_slice(&hw1.to_le_bytes());
3900                bytes.extend_from_slice(&hw2.to_le_bytes());
3901
3902                // CLZ.W rd, rd (4 bytes at offset 10)
3903                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3904                let hw1: u16 = (0xFAB0 | rd_bits) as u16;
3905                let hw2: u16 = (0xF080 | (rd_bits << 8) | rd_bits) as u16;
3906                bytes.extend_from_slice(&hw1.to_le_bytes());
3907                bytes.extend_from_slice(&hw2.to_le_bytes());
3908
3909                // B .done (2 bytes at offset 14)
3910                // PC = 14 + 4 = 18, target = 30, offset = 12, imm11 = 6
3911                let b_done: u16 = 0xE006;
3912                bytes.extend_from_slice(&b_done.to_le_bytes());
3913
3914                // NOP (2 bytes at offset 16) - padding
3915                bytes.extend_from_slice(&0xBF00u16.to_le_bytes());
3916
3917                // .lo_zero: (offset 18)
3918                // RBIT.W rd, rnhi (4 bytes)
3919                // RBIT T1: hw1 = 0xFA9<Rm>, hw2 = 0xF<Rd>A<Rm>
3920                let hw1: u16 = (0xFA90 | rn_hi_bits) as u16;
3921                let hw2: u16 = (0xF0A0 | (rd_bits << 8) | rn_hi_bits) as u16;
3922                bytes.extend_from_slice(&hw1.to_le_bytes());
3923                bytes.extend_from_slice(&hw2.to_le_bytes());
3924
3925                // CLZ.W rd, rd (4 bytes at offset 22)
3926                // CLZ T1: hw1 = 0xFAB<Rm>, hw2 = 0xF<Rd>8<Rm>
3927                let hw1: u16 = (0xFAB0 | rd_bits) as u16;
3928                let hw2: u16 = (0xF080 | (rd_bits << 8) | rd_bits) as u16;
3929                bytes.extend_from_slice(&hw1.to_le_bytes());
3930                bytes.extend_from_slice(&hw2.to_le_bytes());
3931
3932                // ADD.W rd, rd, #32 (4 bytes at offset 26)
3933                let hw1: u16 = (0xF100 | rd_bits) as u16;
3934                let hw2: u16 = ((rd_bits << 8) | 0x20) as u16;
3935                bytes.extend_from_slice(&hw1.to_le_bytes());
3936                bytes.extend_from_slice(&hw2.to_le_bytes());
3937
3938                // .done: (offset 30)
3939                // i64.ctz returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
3940                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
3941                bytes.extend_from_slice(&mov0.to_le_bytes());
3942
3943                Ok(bytes)
3944            }
3945
3946            // I64Popcnt: Population count of 64-bit value
3947            // result = POPCNT(lo) + POPCNT(hi)
3948            // Using SIMD-style parallel bit counting algorithm
3949            ArmOp::I64Popcnt { rd, rnlo, rnhi } => {
3950                let rd_bits = reg_to_bits(rd);
3951                let rn_lo_bits = reg_to_bits(rnlo);
3952                let rn_hi_bits = reg_to_bits(rnhi);
3953                let r12: u32 = 12; // IP scratch
3954                let r3: u32 = 3; // Scratch for hi popcnt result
3955                let mut bytes = Vec::new();
3956
3957                // PUSH {R3, R4, R5} - save scratch registers
3958                bytes.extend_from_slice(&0xB438u16.to_le_bytes());
3959
3960                // Strategy: compute popcnt(lo) -> R4, popcnt(hi) -> R5, add them -> rd
3961                // Using lookup table approach for each byte would be too large
3962                // Using shift-and-add approach instead
3963
3964                // For simplicity and correctness, use the efficient parallel algorithm
3965                // but implement it as a series of inline operations
3966
3967                // MOV R4, rnlo
3968                let d_bit: u32 = 0; // R4 < 8, so high bit is 0
3969                let mov: u16 = (0x4600 | (d_bit << 7) | (rn_lo_bits << 3) | (4 & 0x7)) as u16;
3970                bytes.extend_from_slice(&mov.to_le_bytes());
3971
3972                // MOV R5, rnhi
3973                let d_bit: u32 = 0; // R5 < 8, so high bit is 0
3974                let mov: u16 = (0x4600 | (d_bit << 7) | (rn_hi_bits << 3) | (5 & 0x7)) as u16;
3975                bytes.extend_from_slice(&mov.to_le_bytes());
3976
3977                // --- POPCNT for R4 (lo word) ---
3978                // Step 1: x = x - ((x >> 1) & 0x55555555)
3979                // LSR.W R12, R4, #1
3980                let hw1: u16 = 0xEA4F;
3981                let hw2: u16 = ((r12 << 8) | 0x50 | 4) as u16;
3982                bytes.extend_from_slice(&hw1.to_le_bytes());
3983                bytes.extend_from_slice(&hw2.to_le_bytes());
3984
3985                // Load 0x55555555 into R3 using MOVW/MOVT
3986                // MOVW R3, #0x5555
3987                bytes.extend_from_slice(&0xF245u16.to_le_bytes());
3988                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3989                // MOVT R3, #0x5555
3990                bytes.extend_from_slice(&0xF2C5u16.to_le_bytes());
3991                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
3992
3993                // AND.W R12, R12, R3
3994                let hw1: u16 = (0xEA00 | r12) as u16;
3995                let hw2: u16 = ((r12 << 8) | r3) as u16;
3996                bytes.extend_from_slice(&hw1.to_le_bytes());
3997                bytes.extend_from_slice(&hw2.to_le_bytes());
3998
3999                // SUB.W R4, R4, R12
4000                let hw1: u16 = (0xEBA0 | 4) as u16;
4001                let hw2: u16 = ((4 << 8) | r12) as u16;
4002                bytes.extend_from_slice(&hw1.to_le_bytes());
4003                bytes.extend_from_slice(&hw2.to_le_bytes());
4004
4005                // Step 2: x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
4006                // Load 0x33333333 into R3
4007                // MOVW R3, #0x3333
4008                bytes.extend_from_slice(&0xF243u16.to_le_bytes());
4009                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
4010                // MOVT R3, #0x3333
4011                bytes.extend_from_slice(&0xF2C3u16.to_le_bytes());
4012                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
4013
4014                // AND.W R12, R4, R3
4015                let hw1: u16 = (0xEA00 | 4) as u16;
4016                let hw2: u16 = ((r12 << 8) | r3) as u16;
4017                bytes.extend_from_slice(&hw1.to_le_bytes());
4018                bytes.extend_from_slice(&hw2.to_le_bytes());
4019
4020                // LSR.W R4, R4, #2
4021                let hw1: u16 = 0xEA4F;
4022                let hw2: u16 = ((4 << 8) | 0x90 | 4) as u16;
4023                bytes.extend_from_slice(&hw1.to_le_bytes());
4024                bytes.extend_from_slice(&hw2.to_le_bytes());
4025
4026                // AND.W R4, R4, R3
4027                let hw1: u16 = (0xEA00 | 4) as u16;
4028                let hw2: u16 = ((4 << 8) | r3) as u16;
4029                bytes.extend_from_slice(&hw1.to_le_bytes());
4030                bytes.extend_from_slice(&hw2.to_le_bytes());
4031
4032                // ADD.W R4, R4, R12
4033                let hw1: u16 = (0xEB00 | 4) as u16;
4034                let hw2: u16 = ((4 << 8) | r12) as u16;
4035                bytes.extend_from_slice(&hw1.to_le_bytes());
4036                bytes.extend_from_slice(&hw2.to_le_bytes());
4037
4038                // Step 3: x = (x + (x >> 4)) & 0x0F0F0F0F
4039                // LSR.W R12, R4, #4
4040                // hw2 = (imm3 << 12) | (Rd << 8) | (imm2 << 6) | (type << 4) | Rm
4041                // imm5=4=00100 → imm3=1, imm2=0, type=01(LSR)
4042                let hw1: u16 = 0xEA4F;
4043                let hw2: u16 = (0x1000 | (r12 << 8) | 0x10 | 4) as u16;
4044                bytes.extend_from_slice(&hw1.to_le_bytes());
4045                bytes.extend_from_slice(&hw2.to_le_bytes());
4046
4047                // ADD.W R4, R4, R12
4048                let hw1: u16 = (0xEB00 | 4) as u16;
4049                let hw2: u16 = ((4 << 8) | r12) as u16;
4050                bytes.extend_from_slice(&hw1.to_le_bytes());
4051                bytes.extend_from_slice(&hw2.to_le_bytes());
4052
4053                // Load 0x0F0F0F0F into R3
4054                // MOVW R3, #0x0F0F (imm4=0, i=1, imm3=7, imm8=0x0F)
4055                // hw1 = 11110 1 10 0100 0000 = 0xF640
4056                // hw2 = 0 111 0011 00001111 = 0x730F
4057                bytes.extend_from_slice(&0xF640u16.to_le_bytes());
4058                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
4059                // MOVT R3, #0x0F0F
4060                bytes.extend_from_slice(&0xF6C0u16.to_le_bytes());
4061                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
4062
4063                // AND.W R4, R4, R3
4064                let hw1: u16 = (0xEA00 | 4) as u16;
4065                let hw2: u16 = ((4 << 8) | r3) as u16;
4066                bytes.extend_from_slice(&hw1.to_le_bytes());
4067                bytes.extend_from_slice(&hw2.to_le_bytes());
4068
4069                // Step 4: x = x * 0x01010101 >> 24
4070                // Load 0x01010101 into R3
4071                // MOVW R3, #0x0101
4072                bytes.extend_from_slice(&0xF240u16.to_le_bytes());
4073                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
4074                // MOVT R3, #0x0101
4075                bytes.extend_from_slice(&0xF2C0u16.to_le_bytes());
4076                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
4077
4078                // MUL R4, R4, R3
4079                // MUL T2: hw1 = 0xFB00|Rn, hw2 = 0xF000|(Rd<<8)|Rm
4080                let hw1: u16 = (0xFB00 | 4) as u16;
4081                let hw2: u16 = (0xF000 | (4 << 8) | r3) as u16;
4082                bytes.extend_from_slice(&hw1.to_le_bytes());
4083                bytes.extend_from_slice(&hw2.to_le_bytes());
4084
4085                // LSR.W R4, R4, #24
4086                // imm5=24=11000 → imm3=6, imm2=0, type=01(LSR)
4087                let hw1: u16 = 0xEA4F;
4088                let hw2: u16 = (0x6000 | (4 << 8) | 0x10 | 4) as u16;
4089                bytes.extend_from_slice(&hw1.to_le_bytes());
4090                bytes.extend_from_slice(&hw2.to_le_bytes());
4091
4092                // --- POPCNT for R5 (hi word) - same algorithm ---
4093                // Step 1
4094                let hw1: u16 = 0xEA4F;
4095                let hw2: u16 = ((r12 << 8) | 0x50 | 5) as u16;
4096                bytes.extend_from_slice(&hw1.to_le_bytes());
4097                bytes.extend_from_slice(&hw2.to_le_bytes());
4098
4099                // Load 0x55555555 into R3
4100                bytes.extend_from_slice(&0xF245u16.to_le_bytes());
4101                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
4102                bytes.extend_from_slice(&0xF2C5u16.to_le_bytes());
4103                bytes.extend_from_slice(&0x5355u16.to_le_bytes());
4104
4105                let hw1: u16 = (0xEA00 | r12) as u16;
4106                let hw2: u16 = ((r12 << 8) | r3) as u16;
4107                bytes.extend_from_slice(&hw1.to_le_bytes());
4108                bytes.extend_from_slice(&hw2.to_le_bytes());
4109
4110                let hw1: u16 = (0xEBA0 | 5) as u16;
4111                let hw2: u16 = ((5 << 8) | r12) as u16;
4112                bytes.extend_from_slice(&hw1.to_le_bytes());
4113                bytes.extend_from_slice(&hw2.to_le_bytes());
4114
4115                // Step 2
4116                bytes.extend_from_slice(&0xF243u16.to_le_bytes());
4117                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
4118                bytes.extend_from_slice(&0xF2C3u16.to_le_bytes());
4119                bytes.extend_from_slice(&0x3333u16.to_le_bytes());
4120
4121                let hw1: u16 = (0xEA00 | 5) as u16;
4122                let hw2: u16 = ((r12 << 8) | r3) as u16;
4123                bytes.extend_from_slice(&hw1.to_le_bytes());
4124                bytes.extend_from_slice(&hw2.to_le_bytes());
4125
4126                let hw1: u16 = 0xEA4F;
4127                let hw2: u16 = ((5 << 8) | 0x90 | 5) as u16;
4128                bytes.extend_from_slice(&hw1.to_le_bytes());
4129                bytes.extend_from_slice(&hw2.to_le_bytes());
4130
4131                let hw1: u16 = (0xEA00 | 5) as u16;
4132                let hw2: u16 = ((5 << 8) | r3) as u16;
4133                bytes.extend_from_slice(&hw1.to_le_bytes());
4134                bytes.extend_from_slice(&hw2.to_le_bytes());
4135
4136                let hw1: u16 = (0xEB00 | 5) as u16;
4137                let hw2: u16 = ((5 << 8) | r12) as u16;
4138                bytes.extend_from_slice(&hw1.to_le_bytes());
4139                bytes.extend_from_slice(&hw2.to_le_bytes());
4140
4141                // Step 3: LSR.W R12, R5, #4
4142                // imm5=4=00100 → imm3=1, imm2=0, type=01(LSR)
4143                let hw1: u16 = 0xEA4F;
4144                let hw2: u16 = (0x1000 | (r12 << 8) | 0x10 | 5) as u16;
4145                bytes.extend_from_slice(&hw1.to_le_bytes());
4146                bytes.extend_from_slice(&hw2.to_le_bytes());
4147
4148                let hw1: u16 = (0xEB00 | 5) as u16;
4149                let hw2: u16 = ((5 << 8) | r12) as u16;
4150                bytes.extend_from_slice(&hw1.to_le_bytes());
4151                bytes.extend_from_slice(&hw2.to_le_bytes());
4152
4153                // Load 0x0F0F0F0F into R3 (for hi-word)
4154                bytes.extend_from_slice(&0xF640u16.to_le_bytes());
4155                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
4156                bytes.extend_from_slice(&0xF6C0u16.to_le_bytes());
4157                bytes.extend_from_slice(&0x730Fu16.to_le_bytes());
4158
4159                let hw1: u16 = (0xEA00 | 5) as u16;
4160                let hw2: u16 = ((5 << 8) | r3) as u16;
4161                bytes.extend_from_slice(&hw1.to_le_bytes());
4162                bytes.extend_from_slice(&hw2.to_le_bytes());
4163
4164                // Step 4
4165                bytes.extend_from_slice(&0xF240u16.to_le_bytes());
4166                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
4167                bytes.extend_from_slice(&0xF2C0u16.to_le_bytes());
4168                bytes.extend_from_slice(&0x1301u16.to_le_bytes());
4169
4170                // MUL R5, R5, R3
4171                // MUL T2: hw1 = 0xFB00|Rn, hw2 = 0xF000|(Rd<<8)|Rm
4172                let hw1: u16 = (0xFB00 | 5) as u16;
4173                let hw2: u16 = (0xF000 | (5 << 8) | r3) as u16;
4174                bytes.extend_from_slice(&hw1.to_le_bytes());
4175                bytes.extend_from_slice(&hw2.to_le_bytes());
4176
4177                // LSR.W R5, R5, #24
4178                // imm5=24=11000 → imm3=6, imm2=0, type=01(LSR)
4179                let hw1: u16 = 0xEA4F;
4180                let hw2: u16 = (0x6000 | (5 << 8) | 0x10 | 5) as u16;
4181                bytes.extend_from_slice(&hw1.to_le_bytes());
4182                bytes.extend_from_slice(&hw2.to_le_bytes());
4183
4184                // ADD rd, R4, R5 (combine lo and hi counts)
4185                // ADDS Rd, Rn, Rm (T1): 0001 100 Rm Rn Rd = 0x1800 | (Rm<<6) | (Rn<<3) | Rd
4186                let rd_bits_u16 = rd_bits as u16;
4187                let instr: u16 = 0x1800 | (5 << 6) | (4 << 3) | rd_bits_u16;
4188                bytes.extend_from_slice(&instr.to_le_bytes());
4189
4190                // POP {R3, R4, R5}
4191                bytes.extend_from_slice(&0xBC38u16.to_le_bytes());
4192
4193                // i64.popcnt returns i64, so clear high word: MOV rnhi, #0 (2 bytes)
4194                let mov0: u16 = (0x2000 | (rn_hi_bits << 8)) as u16;
4195                bytes.extend_from_slice(&mov0.to_le_bytes());
4196
4197                Ok(bytes)
4198            }
4199
4200            // I64Extend8S: Sign-extend low 8 bits to 64 bits
4201            // Result: rdlo = sign_extend_8(rnlo), rdhi = rdlo >> 31
4202            ArmOp::I64Extend8S { rdlo, rdhi, rnlo } => {
4203                let rdlo_bits = reg_to_bits(rdlo);
4204                let rdhi_bits = reg_to_bits(rdhi);
4205                let rnlo_bits = reg_to_bits(rnlo);
4206                let mut bytes = Vec::new();
4207
4208                // SXTB.W rdlo, rnlo (sign-extend byte to 32-bit)
4209                // SXTB T2: hw1 = 0xFA4F, hw2 = 0xF0<Rd><Rm>
4210                let hw1: u16 = 0xFA4F_u16;
4211                let hw2: u16 = (0xF080 | (rdlo_bits << 8) | rnlo_bits) as u16;
4212                bytes.extend_from_slice(&hw1.to_le_bytes());
4213                bytes.extend_from_slice(&hw2.to_le_bytes());
4214
4215                // ASR.W rdhi, rdlo, #31 (sign-extend to high word)
4216                // ASR (immediate): hw1 = 0xEA4F, hw2 = imm3:Rd:imm2:type:Rm
4217                // For imm5=31: imm3=111, imm2=11, type=10 (ASR)
4218                // hw2 = (7 << 12) | (rdhi << 8) | (3 << 6) | (2 << 4) | rdlo
4219                let hw1: u16 = 0xEA4F;
4220                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rdlo_bits) as u16;
4221                bytes.extend_from_slice(&hw1.to_le_bytes());
4222                bytes.extend_from_slice(&hw2.to_le_bytes());
4223
4224                Ok(bytes)
4225            }
4226
4227            // I64Extend16S: Sign-extend low 16 bits to 64 bits
4228            // Result: rdlo = sign_extend_16(rnlo), rdhi = rdlo >> 31
4229            ArmOp::I64Extend16S { rdlo, rdhi, rnlo } => {
4230                let rdlo_bits = reg_to_bits(rdlo);
4231                let rdhi_bits = reg_to_bits(rdhi);
4232                let rnlo_bits = reg_to_bits(rnlo);
4233                let mut bytes = Vec::new();
4234
4235                // SXTH.W rdlo, rnlo (sign-extend halfword to 32-bit)
4236                // SXTH T2: hw1 = 0xFA0F, hw2 = 0xF0<Rd><Rm>
4237                let hw1: u16 = 0xFA0F_u16;
4238                let hw2: u16 = (0xF080 | (rdlo_bits << 8) | rnlo_bits) as u16;
4239                bytes.extend_from_slice(&hw1.to_le_bytes());
4240                bytes.extend_from_slice(&hw2.to_le_bytes());
4241
4242                // ASR.W rdhi, rdlo, #31 (sign-extend to high word)
4243                let hw1: u16 = 0xEA4F;
4244                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rdlo_bits) as u16;
4245                bytes.extend_from_slice(&hw1.to_le_bytes());
4246                bytes.extend_from_slice(&hw2.to_le_bytes());
4247
4248                Ok(bytes)
4249            }
4250
4251            // I64Extend32S: Sign-extend low 32 bits to 64 bits
4252            // Result: rdlo = rnlo, rdhi = rnlo >> 31
4253            ArmOp::I64Extend32S { rdlo, rdhi, rnlo } => {
4254                let rdlo_bits = reg_to_bits(rdlo);
4255                let rdhi_bits = reg_to_bits(rdhi);
4256                let rnlo_bits = reg_to_bits(rnlo);
4257                let mut bytes = Vec::new();
4258
4259                // MOV rdlo, rnlo (if different)
4260                if rdlo_bits != rnlo_bits {
4261                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
4262                    let d_bit = ((rdlo_bits >> 3) & 1) as u16;
4263                    let mov: u16 = 0x4600
4264                        | (d_bit << 7)
4265                        | ((rnlo_bits as u16) << 3)
4266                        | ((rdlo_bits & 0x7) as u16);
4267                    bytes.extend_from_slice(&mov.to_le_bytes());
4268                }
4269
4270                // ASR.W rdhi, rnlo, #31 (sign-extend to high word)
4271                let hw1: u16 = 0xEA4F;
4272                let hw2: u16 = (0x70E0 | (rdhi_bits << 8) | rnlo_bits) as u16;
4273                bytes.extend_from_slice(&hw1.to_le_bytes());
4274                bytes.extend_from_slice(&hw2.to_le_bytes());
4275
4276                Ok(bytes)
4277            }
4278
4279            // SelectMove: IT <cond>; MOV{cond} rd, rm
4280            // Conditional move: only execute MOV if condition is true
4281            ArmOp::SelectMove { rd, rm, cond } => {
4282                let rd_bits = reg_to_bits(rd) as u16;
4283                let rm_bits = reg_to_bits(rm) as u16;
4284
4285                // Condition code encoding for IT block
4286                use synth_synthesis::Condition;
4287                let cond_bits: u16 = match cond {
4288                    Condition::EQ => 0x0, // Equal
4289                    Condition::NE => 0x1, // Not equal
4290                    Condition::HS => 0x2, // Higher or same (unsigned >=)
4291                    Condition::LO => 0x3, // Lower (unsigned <)
4292                    Condition::HI => 0x8, // Higher (unsigned >)
4293                    Condition::LS => 0x9, // Lower or same (unsigned <=)
4294                    Condition::GE => 0xA, // Greater or equal (signed)
4295                    Condition::LT => 0xB, // Less than (signed)
4296                    Condition::GT => 0xC, // Greater than (signed)
4297                    Condition::LE => 0xD, // Less or equal (signed)
4298                };
4299
4300                // IT <cond>: single Then block (mask = 0x8 for T only)
4301                // IT instruction: 1011 1111 firstcond mask
4302                let it_instr: u16 = 0xBF00 | (cond_bits << 4) | 0x8;
4303
4304                // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
4305                // This MOV will only execute if condition is true due to IT block
4306                let d_bit = (rd_bits >> 3) & 1;
4307                let mov_instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
4308
4309                // Emit: IT <cond>, MOV rd, rm
4310                let mut bytes = it_instr.to_le_bytes().to_vec();
4311                bytes.extend_from_slice(&mov_instr.to_le_bytes());
4312                Ok(bytes)
4313            }
4314
4315            // Popcnt: Population count (count set bits)
4316            // ARM Cortex-M has no native POPCNT, so we implement the bit manipulation algorithm:
4317            // x = x - ((x >> 1) & 0x55555555);
4318            // x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
4319            // x = (x + (x >> 4)) & 0x0F0F0F0F;
4320            // x = x + (x >> 8);
4321            // x = x + (x >> 16);
4322            // return x & 0x3F;
4323            //
4324            // Uses rd as working register and R12 as scratch for constants
4325            ArmOp::Popcnt { rd, rm } => {
4326                let mut bytes = Vec::new();
4327
4328                // First, move rm to rd if they're different
4329                if rd != rm {
4330                    let rd_bits = reg_to_bits(rd) as u16;
4331                    let rm_bits = reg_to_bits(rm) as u16;
4332                    // MOV Rd, Rm (16-bit): 0100 0110 D Rm Rd[2:0]
4333                    let d_bit = (rd_bits >> 3) & 1;
4334                    let mov_instr: u16 = 0x4600 | (d_bit << 7) | (rm_bits << 3) | (rd_bits & 0x7);
4335                    bytes.extend_from_slice(&mov_instr.to_le_bytes());
4336                }
4337
4338                // Step 1: x = x - ((x >> 1) & 0x55555555)
4339                // Load 0x55555555 into R12
4340                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x5555)?);
4341                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x5555)?);
4342
4343                // R12_temp = rd >> 1
4344                // We need a second scratch register. Use R11.
4345                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 1)?);
4346
4347                // R11 = R11 & R12 (R11 = (x >> 1) & 0x55555555)
4348                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(11, 11, 12)?);
4349
4350                // rd = rd - R11
4351                bytes.extend_from_slice(&self.encode_thumb32_sub_reg_raw(
4352                    reg_to_bits(rd),
4353                    reg_to_bits(rd),
4354                    11,
4355                )?);
4356
4357                // Step 2: x = (x & 0x33333333) + ((x >> 2) & 0x33333333)
4358                // Load 0x33333333 into R12
4359                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x3333)?);
4360                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x3333)?);
4361
4362                // R11 = rd & R12
4363                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4364                    11,
4365                    reg_to_bits(rd),
4366                    12,
4367                )?);
4368
4369                // rd = rd >> 2
4370                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(
4371                    reg_to_bits(rd),
4372                    reg_to_bits(rd),
4373                    2,
4374                )?);
4375
4376                // rd = rd & R12
4377                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4378                    reg_to_bits(rd),
4379                    reg_to_bits(rd),
4380                    12,
4381                )?);
4382
4383                // rd = rd + R11
4384                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4385                    reg_to_bits(rd),
4386                    reg_to_bits(rd),
4387                    11,
4388                )?);
4389
4390                // Step 3: x = (x + (x >> 4)) & 0x0F0F0F0F
4391                // R11 = rd >> 4
4392                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 4)?);
4393
4394                // rd = rd + R11
4395                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4396                    reg_to_bits(rd),
4397                    reg_to_bits(rd),
4398                    11,
4399                )?);
4400
4401                // Load 0x0F0F0F0F into R12
4402                bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, 0x0F0F)?);
4403                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, 0x0F0F)?);
4404
4405                // rd = rd & R12
4406                bytes.extend_from_slice(&self.encode_thumb32_and_reg_raw(
4407                    reg_to_bits(rd),
4408                    reg_to_bits(rd),
4409                    12,
4410                )?);
4411
4412                // Step 4: x = x + (x >> 8)
4413                // R11 = rd >> 8
4414                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 8)?);
4415
4416                // rd = rd + R11
4417                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4418                    reg_to_bits(rd),
4419                    reg_to_bits(rd),
4420                    11,
4421                )?);
4422
4423                // Step 5: x = x + (x >> 16)
4424                // R11 = rd >> 16
4425                bytes.extend_from_slice(&self.encode_thumb32_lsr_raw(11, reg_to_bits(rd), 16)?);
4426
4427                // rd = rd + R11
4428                bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(
4429                    reg_to_bits(rd),
4430                    reg_to_bits(rd),
4431                    11,
4432                )?);
4433
4434                // Step 6: return x & 0x3F
4435                // AND with 0x3F (small immediate, can use BIC or AND with immediate)
4436                bytes.extend_from_slice(&self.encode_thumb32_and_imm_raw(
4437                    reg_to_bits(rd),
4438                    reg_to_bits(rd),
4439                    0x3F,
4440                )?);
4441
4442                Ok(bytes)
4443            }
4444
4445            // I64DivU: 64-bit unsigned division using binary long division
4446            // Input: R0:R1 = dividend, R2:R3 = divisor
4447            // Output: R0:R1 = quotient
4448            // Uses: R4-R7, R12 as loop counter (avoid R8 for Renode compatibility)
4449            ArmOp::I64DivU {
4450                rdlo: _,
4451                rdhi: _,
4452                rnlo: _,
4453                rnhi: _,
4454                rmlo: _,
4455                rmhi: _,
4456            } => {
4457                let mut bytes = Vec::new();
4458
4459                // PUSH {R4-R7} - save scratch registers (NO LR — this is inline code)
4460                // 16-bit PUSH: 1011 010 M rrrrrrrr where M=0 (no LR), r=R4-R7 = 0xF0
4461                // Encoding: 1011 0100 1111 0000 = 0xB4F0
4462                bytes.extend_from_slice(&0xB4F0u16.to_le_bytes());
4463
4464                // Initialize quotient (R4:R5) = 0
4465                bytes.extend_from_slice(&0x2400u16.to_le_bytes()); // MOV R4, #0
4466                bytes.extend_from_slice(&0x2500u16.to_le_bytes()); // MOV R5, #0
4467
4468                // Initialize remainder (R6:R7) = 0
4469                bytes.extend_from_slice(&0x2600u16.to_le_bytes()); // MOV R6, #0
4470                bytes.extend_from_slice(&0x2700u16.to_le_bytes()); // MOV R7, #0
4471
4472                // Initialize loop counter R12 = 64 (use R12 scratch instead of R8)
4473                // MOV.W R12, #64: F04F 0C40
4474                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4475                bytes.extend_from_slice(&0x0C40u16.to_le_bytes());
4476
4477                // Loop start
4478                let loop_start = bytes.len();
4479
4480                // === Loop body: process one bit ===
4481
4482                // 1. Shift quotient R4:R5 left by 1
4483                // LSLS R5, R5, #1 (16-bit: 0000 0010 1010 1101 = 0x006D -> actually 0x002D for LSL R5,R5,#1)
4484                // LSL Rd, Rm, #imm5: 000 00 imm5 Rm Rd = 000 00 00001 101 101 = 0x006D
4485                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4486                // Get carry from R4 into R5: ORR R5, R5, R4 LSR #31
4487                // Thumb-2 ORR with shifted register: EA45 75D4 = ORR.W R5, R5, R4, LSR #31
4488                // 11101010 010 S Rn | 0 imm3 Rd imm2 type Rm
4489                // type=01 (LSR), imm5=31 (imm3=111, imm2=11)
4490                bytes.extend_from_slice(&0xEA45u16.to_le_bytes());
4491                bytes.extend_from_slice(&0x75D4u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4492                // LSLS R4, R4, #1: 000 00 00001 100 100 = 0x0064
4493                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4494
4495                // 2. Shift remainder R6:R7 left by 1, OR in MSB of dividend R1
4496                // LSLS R7, R7, #1
4497                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4498                // ORR.W R7, R7, R6, LSR #31
4499                bytes.extend_from_slice(&0xEA47u16.to_le_bytes());
4500                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4501                // LSLS R6, R6, #1
4502                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4503                // ORR.W R6, R6, R1, LSR #31 (bring in MSB of dividend high)
4504                bytes.extend_from_slice(&0xEA46u16.to_le_bytes());
4505                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4506
4507                // 3. Shift dividend R0:R1 left by 1
4508                // LSLS R1, R1, #1
4509                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4510                // ORR.W R1, R1, R0, LSR #31
4511                bytes.extend_from_slice(&0xEA41u16.to_le_bytes());
4512                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4513                // LSLS R0, R0, #1
4514                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4515
4516                // 4. Compare remainder >= divisor (64-bit unsigned comparison)
4517                // Compare high words first: CMP R7, R3
4518                // CMP Rn, Rm encoding: 0x4280 | (Rm << 3) | Rn
4519                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3 (16-bit)
4520                // BHI means R7 > R3 (unsigned) - definitely subtract
4521                // BLO means R7 < R3 - definitely don't subtract
4522                // BEQ means need to check low words
4523
4524                // If high > divisor high: branch to subtract (forward +offset)
4525                // BHI.N over the next three 16-bit instructions (BLO, CMP, BCC = 6 bytes) to the subtract block
4526                // BHI: 1101 1000 offset8 where cond=1000 (HI)
4527                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4 (to subtract block)
4528
4529                // If high < divisor high: branch past subtract
4530                // BLO.N to the counter decrement: imm8 = 6, i.e. +12 bytes from PC (branch address + 4)
4531                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BLO/BCC +12 (past subtract)
4532
4533                // High words equal, compare low: CMP R6, R2
4534                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2 (16-bit)
4535                // BLO/BCC past subtract (skip SUBS+SBC.W+ORR.W = 10 bytes = 4 halfwords from PC+4)
4536                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords (past subtract)
4537
4538                // === Subtract block: remainder -= divisor, quotient |= 1 ===
4539                // SUBS R6, R6, R2
4540                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2 (16-bit)
4541                // SBC R7, R7, R3 (with borrow)
4542                // Thumb-2 SBC.W: EB67 0703 = SBC.W R7, R7, R3
4543                bytes.extend_from_slice(&0xEB67u16.to_le_bytes());
4544                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4545                // ORR R4, R4, #1 (set bit 0 of quotient low)
4546                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4547                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4548
4549                // === Decrement counter and loop ===
4550                // SUBS.W R12, R12, #1 (decrement loop counter)
4551                // SUBS.W R12, R12, #1: F1BC 0C01
4552                bytes.extend_from_slice(&0xF1BCu16.to_le_bytes());
4553                bytes.extend_from_slice(&0x0C01u16.to_le_bytes());
4554
4555                // BNE back to loop_start
4556                let branch_offset_bytes = bytes.len() - loop_start + 4; // +4 for pipeline
4557                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4558                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4559                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4560
4561                // === Loop done, move quotient to R0:R1 ===
4562                bytes.extend_from_slice(&0x4620u16.to_le_bytes()); // MOV R0, R4
4563                bytes.extend_from_slice(&0x4629u16.to_le_bytes()); // MOV R1, R5
4564
4565                // POP {R4-R7} - restore scratch registers (NO PC — inline code continues)
4566                // 16-bit POP: 1011 110 P rrrrrrrr where P=0 (no PC), r=R4-R7 = 0xF0
4567                // Encoding: 1011 1100 1111 0000 = 0xBCF0
4568                bytes.extend_from_slice(&0xBCF0u16.to_le_bytes());
4569
4570                Ok(bytes)
4571            }
4572
4573            // I64DivS: 64-bit signed division
4574            // Converts both operands to unsigned magnitudes, divides, then applies the sign
4575            // Input: R0:R1 = dividend (signed), R2:R3 = divisor (signed)
4576            // Output: R0:R1 = quotient (signed)
                //
                // Every register field of the op is bound to `_`: the emitted sequence is
                // hard-wired to R0:R1 / R2:R3 and ignores whatever registers the op names.
                // Scratch usage: R4:R5 = quotient, R6:R7 = remainder, R8 = loop counter,
                // R9 = result sign (R1 XOR R3). R4-R11 are pushed and popped around the
                // sequence, although only R4-R9 are written by it. Condition flags are
                // clobbered.
                // NOTE(review): no divide-by-zero or INT64_MIN / -1 check is emitted by this
                // arm — presumably that is handled before this op is reached; confirm.
4577            ArmOp::I64DivS {
4578                rdlo: _,
4579                rdhi: _,
4580                rnlo: _,
4581                rnhi: _,
4582                rmlo: _,
4583                rmhi: _,
4584            } => {
4585                let mut bytes = Vec::new();
4586
4587                // PUSH {R4-R11} - save scratch registers (NO LR — inline code)
4588                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4589                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4590
4591                // Save result sign in R9: R9 = R1 XOR R3 (sign bit = MSB)
4592                // EOR.W R9, R1, R3 — must run before either operand is negated below
4593                bytes.extend_from_slice(&0xEA81u16.to_le_bytes());
4594                bytes.extend_from_slice(&0x0903u16.to_le_bytes());
4595
4596                // If dividend negative (R1 MSB set), negate it
4597                // TST R1, R1 (sets N from the sign bit of R1)
4598                bytes.extend_from_slice(&0x4209u16.to_le_bytes()); // TST R1, R1
4599                // BPL skip_neg_dividend: jumps over the 10-byte negate sequence below
                    // (imm8 = 4, i.e. +8 bytes from PC, where PC = branch address + 4)
4600                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4601
4602                // Negate R0:R1 (64-bit two's complement): invert both words, then add 1 with carry
4603                // MVNS R0, R0; MVNS R1, R1; ADDS R0, R0, #1; ADC.W R1, R1, #0
4604                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4605                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4606                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4607                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4608                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4609
4610                // If divisor negative (R3 MSB set), negate it
4611                bytes.extend_from_slice(&0x421Bu16.to_le_bytes()); // TST R3, R3
4612                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4613
4614                // Negate R2:R3
4615                bytes.extend_from_slice(&0x43D2u16.to_le_bytes()); // MVNS R2, R2
4616                bytes.extend_from_slice(&0x43DBu16.to_le_bytes()); // MVNS R3, R3
4617                bytes.extend_from_slice(&0x1C52u16.to_le_bytes()); // ADDS R2, R2, #1
4618                bytes.extend_from_slice(&0xF143u16.to_le_bytes()); // ADC.W R3, R3, #0
4619                bytes.extend_from_slice(&0x0300u16.to_le_bytes());
4620
4621                // === Now do unsigned division (same as I64DivU) ===
4622                // Initialize quotient (R4:R5) = 0
4623                bytes.extend_from_slice(&0x2400u16.to_le_bytes()); // MOVS R4, #0
4624                bytes.extend_from_slice(&0x2500u16.to_le_bytes()); // MOVS R5, #0
4625                // Initialize remainder (R6:R7) = 0
4626                bytes.extend_from_slice(&0x2600u16.to_le_bytes()); // MOVS R6, #0
4627                bytes.extend_from_slice(&0x2700u16.to_le_bytes()); // MOVS R7, #0
4628                // Initialize loop counter R8 = 64 (MOV.W R8, #64: F04F 0840)
4629                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4630                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4631
                    // Byte offset of the first loop instruction; the BNE below branches back here.
4632                let loop_start = bytes.len();
4633
4634                // Shift quotient left
4635                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4636                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4637                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4638                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4639
4640                // Shift remainder left, OR in MSB of dividend
4641                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4642                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4643                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4644                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4645                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4646                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4647
4648                // Shift dividend left
4649                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4650                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4651                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4652                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4653
4654                // Compare and conditionally subtract (64-bit unsigned remainder >= divisor).
                    // The three branch immediates are fixed: they rely on the exact sizes of the
                    // instructions that follow, so nothing may be inserted between them.
4655                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4656                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4657                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4658                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4659                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4660
4661                // Subtract and set quotient bit
4662                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4663                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4664                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4665                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4666                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4667
4668                // Decrement and loop
4669                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUBS.W R8, R8, #1 (S bit set: updates Z for the BNE)
4670                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4671
                    // BNE back to loop_start. The branch sits at bytes.len(); its offset is
                    // relative to PC = branch address + 4, hence the extra +4. The negated
                    // halfword count is masked into the 8-bit immediate field.
4672                let branch_offset_bytes = bytes.len() - loop_start + 4;
4673                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4674                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4675                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4676
4677                // Move quotient to R0:R1
4678                bytes.extend_from_slice(&0x4620u16.to_le_bytes()); // MOV R0, R4
4679                bytes.extend_from_slice(&0x4629u16.to_le_bytes()); // MOV R1, R5
4680
4681                // If result should be negative (R9 MSB set), negate R0:R1
4682                bytes.extend_from_slice(&0xF1B9u16.to_le_bytes()); // CMP.W R9, #0 (N flag = sign bit of R9)
4683                bytes.extend_from_slice(&0x0F00u16.to_le_bytes());
4684                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8 (skip negation)
4685
4686                // Negate result R0:R1
4687                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4688                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4689                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4690                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4691                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4692
4693                // POP {R4-R11} - restore scratch registers (NO PC — inline code continues)
4694                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4695                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4696
4697                Ok(bytes)
4698            }
4699
4700            // I64RemU: 64-bit unsigned remainder using binary long division
4701            // Same algorithm as I64DivU but returns remainder instead of quotient
4702            // Input: R0:R1 = dividend, R2:R3 = divisor
4703            // Output: R0:R1 = remainder
                //
                // Every register field of the op is bound to `_`: the emitted sequence is
                // hard-wired to R0:R1 / R2:R3 and ignores whatever registers the op names.
                // Scratch usage: R4:R5 = quotient (built but discarded), R6:R7 = remainder,
                // R8 = loop counter; R4-R8 are pushed and popped around the sequence.
                // Condition flags are clobbered.
                // NOTE(review): no divide-by-zero check is emitted by this arm — presumably
                // that is handled before this op is reached; confirm.
4704            ArmOp::I64RemU {
4705                rdlo: _,
4706                rdhi: _,
4707                rnlo: _,
4708                rnhi: _,
4709                rmlo: _,
4710                rmhi: _,
4711            } => {
4712                let mut bytes = Vec::new();
4713
4714                // PUSH {R4-R8} - save scratch registers (NO LR — inline code)
4715                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4716                bytes.extend_from_slice(&0x01F0u16.to_le_bytes());
4717
4718                // Initialize quotient (R4:R5) = 0 (computed but not returned)
4719                bytes.extend_from_slice(&0x2400u16.to_le_bytes()); // MOVS R4, #0
4720                bytes.extend_from_slice(&0x2500u16.to_le_bytes()); // MOVS R5, #0
4721                // Initialize remainder (R6:R7) = 0
4722                bytes.extend_from_slice(&0x2600u16.to_le_bytes()); // MOVS R6, #0
4723                bytes.extend_from_slice(&0x2700u16.to_le_bytes()); // MOVS R7, #0
4724                // Initialize loop counter R8 = 64 (MOV.W R8, #64: F04F 0840)
4725                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4726                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4727
                    // Byte offset of the first loop instruction; the BNE below branches back here.
4728                let loop_start = bytes.len();
4729
4730                // Shift quotient left (not needed for result, but keeps algorithm same)
4731                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4732                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4733                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4734                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4735
4736                // Shift remainder left, OR in MSB of dividend
4737                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4738                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4739                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4740                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4741                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4742                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4743
4744                // Shift dividend left
4745                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4746                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4747                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4748                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4749
4750                // Compare and conditionally subtract (64-bit unsigned remainder >= divisor).
                    // The three branch immediates are fixed: they rely on the exact sizes of the
                    // instructions that follow, so nothing may be inserted between them.
4751                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4752                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4753                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4754                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4755                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4756
4757                // Subtract and set quotient bit
4758                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4759                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4760                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4761                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4762                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4763
4764                // Decrement and loop
4765                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUBS.W R8, R8, #1 (S bit set: updates Z for the BNE)
4766                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4767
                    // BNE back to loop_start. The branch sits at bytes.len(); its offset is
                    // relative to PC = branch address + 4, hence the extra +4. The negated
                    // halfword count is masked into the 8-bit immediate field.
4768                let branch_offset_bytes = bytes.len() - loop_start + 4;
4769                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4770                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4771                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4772
4773                // Move REMAINDER to R0:R1 (difference from I64DivU)
4774                bytes.extend_from_slice(&0x4630u16.to_le_bytes()); // MOV R0, R6
4775                bytes.extend_from_slice(&0x4639u16.to_le_bytes()); // MOV R1, R7
4776
4777                // POP {R4-R8} - restore scratch registers (NO PC — inline code continues)
4778                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4779                bytes.extend_from_slice(&0x01F0u16.to_le_bytes());
4780
4781                Ok(bytes)
4782            }
4783
4784            // I64RemS: 64-bit signed remainder
4785            // Remainder sign follows dividend sign (not quotient rule)
4786            // Input: R0:R1 = dividend (signed), R2:R3 = divisor (signed)
4787            // Output: R0:R1 = remainder (signed, same sign as dividend)
                //
                // Every register field of the op is bound to `_`: the emitted sequence is
                // hard-wired to R0:R1 / R2:R3 and ignores whatever registers the op names.
                // Scratch usage: R4:R5 = quotient (built but discarded), R6:R7 = remainder,
                // R8 = loop counter, R9 = copy of the original dividend high word (its MSB is
                // the sign applied at the end). R4-R11 are pushed and popped around the
                // sequence, although only R4-R9 are written by it. Condition flags are
                // clobbered.
                // NOTE(review): no divide-by-zero check is emitted by this arm — presumably
                // that is handled before this op is reached; confirm.
4788            ArmOp::I64RemS {
4789                rdlo: _,
4790                rdhi: _,
4791                rnlo: _,
4792                rnhi: _,
4793                rmlo: _,
4794                rmhi: _,
4795            } => {
4796                let mut bytes = Vec::new();
4797
4798                // PUSH {R4-R11} - save scratch registers (NO LR — inline code)
4799                bytes.extend_from_slice(&0xE92Du16.to_le_bytes());
4800                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4801
4802                // Save dividend sign in R9 (remainder sign = dividend sign)
4803                // MOV R9, R1 (just need the sign bit) — must run before R1 is negated below
4804                bytes.extend_from_slice(&0x4689u16.to_le_bytes()); // MOV R9, R1
4805
4806                // If dividend negative (R1 MSB set), negate it
4807                bytes.extend_from_slice(&0x4209u16.to_le_bytes()); // TST R1, R1
4808                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4809
4810                // Negate R0:R1 (invert both words, then add 1 with carry)
4811                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4812                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4813                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4814                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4815                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4816
4817                // If divisor negative (R3 MSB set), negate it
4818                bytes.extend_from_slice(&0x421Bu16.to_le_bytes()); // TST R3, R3
4819                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4820
4821                // Negate R2:R3
4822                bytes.extend_from_slice(&0x43D2u16.to_le_bytes()); // MVNS R2, R2
4823                bytes.extend_from_slice(&0x43DBu16.to_le_bytes()); // MVNS R3, R3
4824                bytes.extend_from_slice(&0x1C52u16.to_le_bytes()); // ADDS R2, R2, #1
4825                bytes.extend_from_slice(&0xF143u16.to_le_bytes()); // ADC.W R3, R3, #0
4826                bytes.extend_from_slice(&0x0300u16.to_le_bytes());
4827
4828                // === Unsigned division algorithm ===
4829                // Initialize quotient (R4:R5) = 0
4830                bytes.extend_from_slice(&0x2400u16.to_le_bytes()); // MOVS R4, #0
4831                bytes.extend_from_slice(&0x2500u16.to_le_bytes()); // MOVS R5, #0
4832                // Initialize remainder (R6:R7) = 0
4833                bytes.extend_from_slice(&0x2600u16.to_le_bytes()); // MOVS R6, #0
4834                bytes.extend_from_slice(&0x2700u16.to_le_bytes()); // MOVS R7, #0
4835                // Initialize loop counter R8 = 64 (MOV.W R8, #64: F04F 0840)
4836                bytes.extend_from_slice(&0xF04Fu16.to_le_bytes());
4837                bytes.extend_from_slice(&0x0840u16.to_le_bytes());
4838
                    // Byte offset of the first loop instruction; the BNE below branches back here.
4839                let loop_start = bytes.len();
4840
4841                // Shift quotient left
4842                bytes.extend_from_slice(&0x006Du16.to_le_bytes()); // LSLS R5, R5, #1
4843                bytes.extend_from_slice(&0xEA45u16.to_le_bytes()); // ORR.W R5, R5, R4, LSR #31
4844                bytes.extend_from_slice(&0x75D4u16.to_le_bytes());
4845                bytes.extend_from_slice(&0x0064u16.to_le_bytes()); // LSLS R4, R4, #1
4846
4847                // Shift remainder left, OR in MSB of dividend
4848                bytes.extend_from_slice(&0x007Fu16.to_le_bytes()); // LSLS R7, R7, #1
4849                bytes.extend_from_slice(&0xEA47u16.to_le_bytes()); // ORR.W R7, R7, R6, LSR #31
4850                bytes.extend_from_slice(&0x77D6u16.to_le_bytes());
4851                bytes.extend_from_slice(&0x0076u16.to_le_bytes()); // LSLS R6, R6, #1
4852                bytes.extend_from_slice(&0xEA46u16.to_le_bytes()); // ORR.W R6, R6, R1, LSR #31
4853                bytes.extend_from_slice(&0x76D1u16.to_le_bytes());
4854
4855                // Shift dividend left
4856                bytes.extend_from_slice(&0x0049u16.to_le_bytes()); // LSLS R1, R1, #1
4857                bytes.extend_from_slice(&0xEA41u16.to_le_bytes()); // ORR.W R1, R1, R0, LSR #31
4858                bytes.extend_from_slice(&0x71D0u16.to_le_bytes());
4859                bytes.extend_from_slice(&0x0040u16.to_le_bytes()); // LSLS R0, R0, #1
4860
4861                // Compare and conditionally subtract (64-bit unsigned remainder >= divisor).
                    // The three branch immediates are fixed: they rely on the exact sizes of the
                    // instructions that follow, so nothing may be inserted between them.
4862                bytes.extend_from_slice(&0x429Fu16.to_le_bytes()); // CMP R7, R3
4863                bytes.extend_from_slice(&0xD802u16.to_le_bytes()); // BHI +4
4864                bytes.extend_from_slice(&0xD306u16.to_le_bytes()); // BCC +12
4865                bytes.extend_from_slice(&0x4296u16.to_le_bytes()); // CMP R6, R2
4866                bytes.extend_from_slice(&0xD304u16.to_le_bytes()); // BCC +4 halfwords
4867
4868                // Subtract and set quotient bit
4869                bytes.extend_from_slice(&0x1AB6u16.to_le_bytes()); // SUBS R6, R6, R2
4870                bytes.extend_from_slice(&0xEB67u16.to_le_bytes()); // SBC.W R7, R7, R3
4871                bytes.extend_from_slice(&0x0703u16.to_le_bytes());
4872                bytes.extend_from_slice(&0xF044u16.to_le_bytes()); // ORR.W R4, R4, #1
4873                bytes.extend_from_slice(&0x0401u16.to_le_bytes());
4874
4875                // Decrement and loop
4876                bytes.extend_from_slice(&0xF1B8u16.to_le_bytes()); // SUBS.W R8, R8, #1 (S bit set: updates Z for the BNE)
4877                bytes.extend_from_slice(&0x0801u16.to_le_bytes());
4878
                    // BNE back to loop_start. The branch sits at bytes.len(); its offset is
                    // relative to PC = branch address + 4, hence the extra +4. The negated
                    // halfword count is masked into the 8-bit immediate field.
4879                let branch_offset_bytes = bytes.len() - loop_start + 4;
4880                let offset_halfwords = -((branch_offset_bytes / 2) as i16);
4881                let bne_encoding = 0xD100u16 | ((offset_halfwords as u16) & 0xFF);
4882                bytes.extend_from_slice(&bne_encoding.to_le_bytes());
4883
4884                // Move remainder to R0:R1
4885                bytes.extend_from_slice(&0x4630u16.to_le_bytes()); // MOV R0, R6
4886                bytes.extend_from_slice(&0x4639u16.to_le_bytes()); // MOV R1, R7
4887
4888                // If original dividend was negative (R9 MSB set), negate remainder
4889                bytes.extend_from_slice(&0xF1B9u16.to_le_bytes()); // CMP.W R9, #0 (N flag = sign bit of R9)
4890                bytes.extend_from_slice(&0x0F00u16.to_le_bytes());
4891                bytes.extend_from_slice(&0xD504u16.to_le_bytes()); // BPL +8
4892
4893                // Negate result R0:R1
4894                bytes.extend_from_slice(&0x43C0u16.to_le_bytes()); // MVNS R0, R0
4895                bytes.extend_from_slice(&0x43C9u16.to_le_bytes()); // MVNS R1, R1
4896                bytes.extend_from_slice(&0x1C40u16.to_le_bytes()); // ADDS R0, R0, #1
4897                bytes.extend_from_slice(&0xF141u16.to_le_bytes()); // ADC.W R1, R1, #0
4898                bytes.extend_from_slice(&0x0100u16.to_le_bytes());
4899
4900                // POP {R4-R11} - restore scratch registers (NO PC — inline code continues)
4901                bytes.extend_from_slice(&0xE8BDu16.to_le_bytes());
4902                bytes.extend_from_slice(&0x0FF0u16.to_le_bytes());
4903
4904                Ok(bytes)
4905            }
4906
4907            // === F32 VFP single-precision Thumb-2 encodings ===
4908            // VFP instruction words are identical to ARM32; emit as two LE halfwords.
                // The hex literal handed to each encode_vfp_* helper is the base opcode word of
                // the instruction named in the trailing comment.
                // NOTE(review): the helpers are defined outside this view — presumably they OR
                // the register fields into that base word; confirm.
4909            ArmOp::F32Add { sd, sn, sm } => {
4910                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE300A00, sd, sn, sm)?)) // VADD.F32
4911            }
4912            ArmOp::F32Sub { sd, sn, sm } => {
4913                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE300A40, sd, sn, sm)?)) // VSUB.F32
4914            }
4915            ArmOp::F32Mul { sd, sn, sm } => {
4916                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE200A00, sd, sn, sm)?)) // VMUL.F32
4917            }
4918            ArmOp::F32Div { sd, sn, sm } => {
4919                Ok(vfp_to_thumb_bytes(encode_vfp_3reg(0xEE800A00, sd, sn, sm)?)) // VDIV.F32
4920            }
4921            ArmOp::F32Abs { sd, sm } => {
4922                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB00AC0, sd, sm)?)) // VABS.F32
4923            }
4924            ArmOp::F32Neg { sd, sm } => {
4925                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB10A40, sd, sm)?)) // VNEG.F32
4926            }
4927            ArmOp::F32Sqrt { sd, sm } => {
4928                Ok(vfp_to_thumb_bytes(encode_vfp_2reg(0xEEB10AC0, sd, sm)?)) // VSQRT.F32
4929            }
4930
4931            // f32 pseudo-ops — multi-instruction sequences
4932            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
                // The last argument of encode_thumb_f32_rounding is that 2-bit RMode value.
4933            ArmOp::F32Ceil { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b01),
4934            ArmOp::F32Floor { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b10),
4935            ArmOp::F32Trunc { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b11),
4936            ArmOp::F32Nearest { sd, sm } => self.encode_thumb_f32_rounding(sd, sm, 0b00),
                // Min and Max share one helper; the trailing flag is `true` for Min.
4937            ArmOp::F32Min { sd, sn, sm } => self.encode_thumb_f32_minmax(sd, sn, sm, true),
4938            ArmOp::F32Max { sd, sn, sm } => self.encode_thumb_f32_minmax(sd, sn, sm, false),
4939            ArmOp::F32Copysign { sd, sn, sm } => self.encode_thumb_f32_copysign(sd, sn, sm),
4940
4941            // f32 comparisons — VCMP + VMRS + MOV #0 + IT + MOV #1
                // The last argument follows the ARM condition-code numbering:
                // 0x0 = EQ, 0x1 = NE, 0x4 = MI, 0x9 = LS, 0xC = GT, 0xA = GE.
                // NOTE(review): presumably it is the condition of the IT block, and MI/LS are
                // used for Lt/Le (rather than LT/LE) so an unordered (NaN) compare yields 0 —
                // confirm against encode_thumb_f32_compare.
4942            ArmOp::F32Eq { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x0),
4943            ArmOp::F32Ne { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x1),
4944            ArmOp::F32Lt { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x4),
4945            ArmOp::F32Le { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0x9),
4946            ArmOp::F32Gt { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0xC),
4947            ArmOp::F32Ge { rd, sn, sm } => self.encode_thumb_f32_compare(rd, sn, sm, 0xA),
4948
4949            ArmOp::F32Const { sd, value } => self.encode_thumb_f32_const(sd, *value),
4950
                // 0xED900A00 / 0xED800A00 differ only in bit 20 (the load/store bit).
4951            ArmOp::F32Load { sd, addr } => {
4952                Ok(vfp_to_thumb_bytes(encode_vfp_ldst(0xED900A00, sd, addr)?)) // VLDR (single)
4953            }
4954            ArmOp::F32Store { sd, addr } => {
4955                Ok(vfp_to_thumb_bytes(encode_vfp_ldst(0xED800A00, sd, addr)?)) // VSTR (single)
4956            }
4957
                // The trailing flag selects the signed (`true`) or unsigned (`false`) variant.
4958            ArmOp::F32ConvertI32S { sd, rm } => self.encode_thumb_f32_convert_i32(sd, rm, true),
4959            ArmOp::F32ConvertI32U { sd, rm } => self.encode_thumb_f32_convert_i32(sd, rm, false),
                // i64 -> f32 is rejected with a synthesis error instead of being encoded.
4960            ArmOp::F32ConvertI64S { .. } | ArmOp::F32ConvertI64U { .. } => {
4961                Err(synth_core::Error::synthesis(
4962                    "F32 i64 conversion not supported (requires register pairs on 32-bit ARM)",
4963                ))
4964            }
                // Bit-pattern moves between a core register and an S register.
                // NOTE(review): the leading bool presumably selects the direction
                // (true = core -> VFP, false = VFP -> core) — confirm in encode_vmov_core_sreg.
4965            ArmOp::F32ReinterpretI32 { sd, rm } => {
4966                Ok(vfp_to_thumb_bytes(encode_vmov_core_sreg(true, sd, rm)?))
4967            }
4968            ArmOp::I32ReinterpretF32 { rd, sm } => {
4969                Ok(vfp_to_thumb_bytes(encode_vmov_core_sreg(false, sm, rd)?))
4970            }
                // The trailing flag selects the signed (`true`) or unsigned (`false`) variant.
4971            ArmOp::I32TruncF32S { rd, sm } => self.encode_thumb_i32_trunc_f32(rd, sm, true),
4972            ArmOp::I32TruncF32U { rd, sm } => self.encode_thumb_i32_trunc_f32(rd, sm, false),
4973
4974            // === F64 VFP double-precision Thumb-2 encodings ===
4975            // VFP instruction words are identical to ARM32; emit as two LE halfwords.
                // The base opcode words mirror the F32 ones with 0xB in bits 11:8 instead of 0xA
                // (the VFP size bit, selecting double precision).
                // NOTE(review): the encode_*_f64 helpers are defined outside this view —
                // presumably they OR the D-register fields into the base word; confirm.
                // VADD.F64
4976            ArmOp::F64Add { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4977                0xEE300B00, dd, dn, dm,
4978            )?)),
                // VSUB.F64
4979            ArmOp::F64Sub { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4980                0xEE300B40, dd, dn, dm,
4981            )?)),
                // VMUL.F64
4982            ArmOp::F64Mul { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4983                0xEE200B00, dd, dn, dm,
4984            )?)),
                // VDIV.F64
4985            ArmOp::F64Div { dd, dn, dm } => Ok(vfp_to_thumb_bytes(encode_vfp_3reg_f64(
4986                0xEE800B00, dd, dn, dm,
4987            )?)),
4988            ArmOp::F64Abs { dd, dm } => {
4989                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB00BC0, dd, dm)?)) // VABS.F64
4990            }
4991            ArmOp::F64Neg { dd, dm } => {
4992                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB10B40, dd, dm)?)) // VNEG.F64
4993            }
4994            ArmOp::F64Sqrt { dd, dm } => {
4995                Ok(vfp_to_thumb_bytes(encode_vfp_2reg_f64(0xEEB10BC0, dd, dm)?)) // VSQRT.F64
4996            }
4997
4998            // f64 pseudo-ops
4999            // FPSCR RMode: 00=nearest, 01=+inf(ceil), 10=-inf(floor), 11=zero(trunc)
                // The last argument of encode_thumb_f64_rounding is that 2-bit RMode value.
5000            ArmOp::F64Ceil { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b01),
5001            ArmOp::F64Floor { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b10),
5002            ArmOp::F64Trunc { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b11),
5003            ArmOp::F64Nearest { dd, dm } => self.encode_thumb_f64_rounding(dd, dm, 0b00),
                // Min and Max share one helper; the trailing flag is `true` for Min.
5004            ArmOp::F64Min { dd, dn, dm } => self.encode_thumb_f64_minmax(dd, dn, dm, true),
5005            ArmOp::F64Max { dd, dn, dm } => self.encode_thumb_f64_minmax(dd, dn, dm, false),
5006            ArmOp::F64Copysign { dd, dn, dm } => self.encode_thumb_f64_copysign(dd, dn, dm),
5007
5008            // f64 comparisons
                // The last argument follows the ARM condition-code numbering:
                // 0x0 = EQ, 0x1 = NE, 0x4 = MI, 0x9 = LS, 0xC = GT, 0xA = GE.
                // NOTE(review): presumably MI/LS are used for Lt/Le (rather than LT/LE) so an
                // unordered (NaN) compare yields 0 — confirm against encode_thumb_f64_compare.
5009            ArmOp::F64Eq { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x0),
5010            ArmOp::F64Ne { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x1),
5011            ArmOp::F64Lt { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x4),
5012            ArmOp::F64Le { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0x9),
5013            ArmOp::F64Gt { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0xC),
5014            ArmOp::F64Ge { rd, dn, dm } => self.encode_thumb_f64_compare(rd, dn, dm, 0xA),
5015
5016            ArmOp::F64Const { dd, value } => self.encode_thumb_f64_const(dd, *value),
5017
                // 0xED900B00 (VLDR, double) / 0xED800B00 (VSTR, double) differ only in bit 20.
5018            ArmOp::F64Load { dd, addr } => Ok(vfp_to_thumb_bytes(encode_vfp_ldst_f64(
5019                0xED900B00, dd, addr,
5020            )?)),
5021            ArmOp::F64Store { dd, addr } => Ok(vfp_to_thumb_bytes(encode_vfp_ldst_f64(
5022                0xED800B00, dd, addr,
5023            )?)),
5024
                // The trailing flag selects the signed (`true`) or unsigned (`false`) variant.
5025            ArmOp::F64ConvertI32S { dd, rm } => self.encode_thumb_f64_convert_i32(dd, rm, true),
5026            ArmOp::F64ConvertI32U { dd, rm } => self.encode_thumb_f64_convert_i32(dd, rm, false),
                // i64 -> f64 is rejected with a synthesis error instead of being encoded.
5027            ArmOp::F64ConvertI64S { .. } | ArmOp::F64ConvertI64U { .. } => {
5028                Err(synth_core::Error::synthesis(
5029                    "F64 i64 conversion not supported (requires register pairs on 32-bit ARM)",
5030                ))
5031            }
5032            ArmOp::F64PromoteF32 { dd, sm } => self.encode_thumb_f64_promote_f32(dd, sm),
                // Bit-pattern moves between a core register pair and a D register.
                // NOTE(review): the leading bool presumably selects the direction
                // (true = core -> VFP, false = VFP -> core) — confirm in encode_vmov_core_dreg.
5033            ArmOp::F64ReinterpretI64 { dd, rmlo, rmhi } => Ok(vfp_to_thumb_bytes(
5034                encode_vmov_core_dreg(true, dd, rmlo, rmhi)?,
5035            )),
5036            ArmOp::I64ReinterpretF64 { rdlo, rdhi, dm } => Ok(vfp_to_thumb_bytes(
5037                encode_vmov_core_dreg(false, dm, rdlo, rdhi)?,
5038            )),
                // f64 -> i64 truncation is rejected with a synthesis error instead of being encoded.
5039            ArmOp::I64TruncF64S { .. } | ArmOp::I64TruncF64U { .. } => {
5040                Err(synth_core::Error::synthesis(
5041                    "i64 truncation from F64 not supported (requires i64 register pairs on 32-bit ARM)",
5042                ))
5043            }
                // The trailing flag selects the signed (`true`) or unsigned (`false`) variant.
5044            ArmOp::I32TruncF64S { rd, dm } => self.encode_thumb_i32_trunc_f64(rd, dm, true),
5045            ArmOp::I32TruncF64U { rd, dm } => self.encode_thumb_i32_trunc_f64(rd, dm, false),
5046
5047            // ===== i64 operations: encode as multi-instruction Thumb-2 sequences =====
5048
5049            // I64Add: ADDS rdlo, rnlo, rmlo; ADC.W rdhi, rnhi, rmhi
5050            ArmOp::I64Add {
5051                rdlo,
5052                rdhi,
5053                rnlo,
5054                rnhi,
5055                rmlo,
5056                rmhi,
5057            } => {
5058                let mut bytes = Vec::new();
5059                // ADDS rdlo, rnlo, rmlo (16-bit)
5060                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Adds {
5061                    rd: *rdlo,
5062                    rn: *rnlo,
5063                    op2: Operand2::Reg(*rmlo),
5064                })?);
5065                // ADC.W rdhi, rnhi, rmhi (32-bit)
5066                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Adc {
5067                    rd: *rdhi,
5068                    rn: *rnhi,
5069                    op2: Operand2::Reg(*rmhi),
5070                })?);
5071                Ok(bytes)
5072            }
5073
5074            // I64Sub: SUBS rdlo, rnlo, rmlo; SBC.W rdhi, rnhi, rmhi
5075            ArmOp::I64Sub {
5076                rdlo,
5077                rdhi,
5078                rnlo,
5079                rnhi,
5080                rmlo,
5081                rmhi,
5082            } => {
5083                let mut bytes = Vec::new();
5084                // SUBS rdlo, rnlo, rmlo (16-bit)
5085                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Subs {
5086                    rd: *rdlo,
5087                    rn: *rnlo,
5088                    op2: Operand2::Reg(*rmlo),
5089                })?);
5090                // SBC.W rdhi, rnhi, rmhi (32-bit)
5091                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Sbc {
5092                    rd: *rdhi,
5093                    rn: *rnhi,
5094                    op2: Operand2::Reg(*rmhi),
5095                })?);
5096                Ok(bytes)
5097            }
5098
5099            // I64And: AND rdlo, rnlo, rmlo; AND rdhi, rnhi, rmhi
5100            ArmOp::I64And {
5101                rdlo,
5102                rdhi,
5103                rnlo,
5104                rnhi,
5105                rmlo,
5106                rmhi,
5107            } => {
5108                let mut bytes = Vec::new();
5109                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::And {
5110                    rd: *rdlo,
5111                    rn: *rnlo,
5112                    op2: Operand2::Reg(*rmlo),
5113                })?);
5114                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::And {
5115                    rd: *rdhi,
5116                    rn: *rnhi,
5117                    op2: Operand2::Reg(*rmhi),
5118                })?);
5119                Ok(bytes)
5120            }
5121
5122            // I64Or: ORR rdlo, rnlo, rmlo; ORR rdhi, rnhi, rmhi
5123            ArmOp::I64Or {
5124                rdlo,
5125                rdhi,
5126                rnlo,
5127                rnhi,
5128                rmlo,
5129                rmhi,
5130            } => {
5131                let mut bytes = Vec::new();
5132                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Orr {
5133                    rd: *rdlo,
5134                    rn: *rnlo,
5135                    op2: Operand2::Reg(*rmlo),
5136                })?);
5137                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Orr {
5138                    rd: *rdhi,
5139                    rn: *rnhi,
5140                    op2: Operand2::Reg(*rmhi),
5141                })?);
5142                Ok(bytes)
5143            }
5144
5145            // I64Xor: EOR rdlo, rnlo, rmlo; EOR rdhi, rnhi, rmhi
5146            ArmOp::I64Xor {
5147                rdlo,
5148                rdhi,
5149                rnlo,
5150                rnhi,
5151                rmlo,
5152                rmhi,
5153            } => {
5154                let mut bytes = Vec::new();
5155                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Eor {
5156                    rd: *rdlo,
5157                    rn: *rnlo,
5158                    op2: Operand2::Reg(*rmlo),
5159                })?);
5160                bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Eor {
5161                    rd: *rdhi,
5162                    rn: *rnhi,
5163                    op2: Operand2::Reg(*rmhi),
5164                })?);
5165                Ok(bytes)
5166            }
5167
5168            // I64Eqz: ORR scratch, lo, hi; ITE EQ; MOV rd, #1; MOV rd, #0
5169            ArmOp::I64Eqz { rd, rnlo, rnhi } => self.encode_thumb(&ArmOp::I64SetCondZ {
5170                rd: *rd,
5171                rn_lo: *rnlo,
5172                rn_hi: *rnhi,
5173            }),
5174
5175            // I64 comparisons: delegate to I64SetCond
5176            ArmOp::I64Eq {
5177                rd,
5178                rnlo,
5179                rnhi,
5180                rmlo,
5181                rmhi,
5182            } => self.encode_thumb(&ArmOp::I64SetCond {
5183                rd: *rd,
5184                rn_lo: *rnlo,
5185                rn_hi: *rnhi,
5186                rm_lo: *rmlo,
5187                rm_hi: *rmhi,
5188                cond: synth_synthesis::Condition::EQ,
5189            }),
5190
5191            ArmOp::I64Ne {
5192                rd,
5193                rnlo,
5194                rnhi,
5195                rmlo,
5196                rmhi,
5197            } => self.encode_thumb(&ArmOp::I64SetCond {
5198                rd: *rd,
5199                rn_lo: *rnlo,
5200                rn_hi: *rnhi,
5201                rm_lo: *rmlo,
5202                rm_hi: *rmhi,
5203                cond: synth_synthesis::Condition::NE,
5204            }),
5205
5206            ArmOp::I64LtS {
5207                rd,
5208                rnlo,
5209                rnhi,
5210                rmlo,
5211                rmhi,
5212            } => self.encode_thumb(&ArmOp::I64SetCond {
5213                rd: *rd,
5214                rn_lo: *rnlo,
5215                rn_hi: *rnhi,
5216                rm_lo: *rmlo,
5217                rm_hi: *rmhi,
5218                cond: synth_synthesis::Condition::LT,
5219            }),
5220
5221            ArmOp::I64LtU {
5222                rd,
5223                rnlo,
5224                rnhi,
5225                rmlo,
5226                rmhi,
5227            } => self.encode_thumb(&ArmOp::I64SetCond {
5228                rd: *rd,
5229                rn_lo: *rnlo,
5230                rn_hi: *rnhi,
5231                rm_lo: *rmlo,
5232                rm_hi: *rmhi,
5233                cond: synth_synthesis::Condition::LO,
5234            }),
5235
5236            ArmOp::I64LeS {
5237                rd,
5238                rnlo,
5239                rnhi,
5240                rmlo,
5241                rmhi,
5242            } => self.encode_thumb(&ArmOp::I64SetCond {
5243                rd: *rd,
5244                rn_lo: *rnlo,
5245                rn_hi: *rnhi,
5246                rm_lo: *rmlo,
5247                rm_hi: *rmhi,
5248                cond: synth_synthesis::Condition::LE,
5249            }),
5250
5251            ArmOp::I64LeU {
5252                rd,
5253                rnlo,
5254                rnhi,
5255                rmlo,
5256                rmhi,
5257            } => self.encode_thumb(&ArmOp::I64SetCond {
5258                rd: *rd,
5259                rn_lo: *rnlo,
5260                rn_hi: *rnhi,
5261                rm_lo: *rmlo,
5262                rm_hi: *rmhi,
5263                cond: synth_synthesis::Condition::LS,
5264            }),
5265
5266            ArmOp::I64GtS {
5267                rd,
5268                rnlo,
5269                rnhi,
5270                rmlo,
5271                rmhi,
5272            } => self.encode_thumb(&ArmOp::I64SetCond {
5273                rd: *rd,
5274                rn_lo: *rnlo,
5275                rn_hi: *rnhi,
5276                rm_lo: *rmlo,
5277                rm_hi: *rmhi,
5278                cond: synth_synthesis::Condition::GT,
5279            }),
5280
5281            ArmOp::I64GtU {
5282                rd,
5283                rnlo,
5284                rnhi,
5285                rmlo,
5286                rmhi,
5287            } => self.encode_thumb(&ArmOp::I64SetCond {
5288                rd: *rd,
5289                rn_lo: *rnlo,
5290                rn_hi: *rnhi,
5291                rm_lo: *rmlo,
5292                rm_hi: *rmhi,
5293                cond: synth_synthesis::Condition::HI,
5294            }),
5295
5296            ArmOp::I64GeS {
5297                rd,
5298                rnlo,
5299                rnhi,
5300                rmlo,
5301                rmhi,
5302            } => self.encode_thumb(&ArmOp::I64SetCond {
5303                rd: *rd,
5304                rn_lo: *rnlo,
5305                rn_hi: *rnhi,
5306                rm_lo: *rmlo,
5307                rm_hi: *rmhi,
5308                cond: synth_synthesis::Condition::GE,
5309            }),
5310
5311            ArmOp::I64GeU {
5312                rd,
5313                rnlo,
5314                rnhi,
5315                rmlo,
5316                rmhi,
5317            } => self.encode_thumb(&ArmOp::I64SetCond {
5318                rd: *rd,
5319                rn_lo: *rnlo,
5320                rn_hi: *rnhi,
5321                rm_lo: *rmlo,
5322                rm_hi: *rmhi,
5323                cond: synth_synthesis::Condition::HS,
5324            }),
5325
5326            // I64Const: MOVW rdlo, lo16; MOVT rdlo, hi16; MOVW rdhi, lo16_hi; MOVT rdhi, hi16_hi
5327            ArmOp::I64Const { rdlo, rdhi, value } => {
5328                let lo32 = *value as u32;
5329                let hi32 = (*value >> 32) as u32;
5330                let mut bytes = Vec::new();
5331                // Load low 32 bits into rdlo
5332                bytes.extend_from_slice(
5333                    &self.encode_thumb32_movw_raw(reg_to_bits(rdlo), lo32 & 0xFFFF)?,
5334                );
5335                if lo32 > 0xFFFF {
5336                    bytes.extend_from_slice(
5337                        &self.encode_thumb32_movt_raw(reg_to_bits(rdlo), lo32 >> 16)?,
5338                    );
5339                }
5340                // Load high 32 bits into rdhi
5341                bytes.extend_from_slice(
5342                    &self.encode_thumb32_movw_raw(reg_to_bits(rdhi), hi32 & 0xFFFF)?,
5343                );
5344                if hi32 > 0xFFFF {
5345                    bytes.extend_from_slice(
5346                        &self.encode_thumb32_movt_raw(reg_to_bits(rdhi), hi32 >> 16)?,
5347                    );
5348                }
5349                Ok(bytes)
5350            }
5351
5352            // I64Ldr: LDR rdlo, [base, offset]; LDR rdhi, [base, offset+4]
5353            ArmOp::I64Ldr { rdlo, rdhi, addr } => {
5354                let mut bytes = Vec::new();
5355                let offset = if addr.offset < 0 {
5356                    0u32
5357                } else {
5358                    addr.offset as u32
5359                };
5360                // #372: a memory `i64.load` carries an index register
5361                // (`reg_imm(R11, addr_reg, offset)` = R11 + addr + offset). The
5362                // immediate `encode_thumb32_ldr` below uses only base+offset and
5363                // would SILENTLY DROP `offset_reg` — the #206 defect, here for
5364                // i64. Materialize the effective base `ip = base + index` first
5365                // (ADD.W ip, base, index — byte-verified), then load with
5366                // immediate offsets. Frame i64 loads (no `offset_reg`, e.g. a
5367                // spilled local at `[SP, #off]`) keep the plain `[base,#off]`
5368                // form unchanged — so existing output is byte-identical.
5369                let base = self.i64_effective_base(&mut bytes, addr);
5370                bytes.extend_from_slice(&self.encode_thumb32_ldr(rdlo, &base, offset)?);
5371                bytes.extend_from_slice(&self.encode_thumb32_ldr(
5372                    rdhi,
5373                    &base,
5374                    offset.wrapping_add(4),
5375                )?);
5376                Ok(bytes)
5377            }
5378
5379            // I64Str: STR rdlo, [base, offset]; STR rdhi, [base, offset+4]
5380            ArmOp::I64Str { rdlo, rdhi, addr } => {
5381                let mut bytes = Vec::new();
5382                let offset = if addr.offset < 0 {
5383                    0u32
5384                } else {
5385                    addr.offset as u32
5386                };
5387                // #372: same index-materialization as I64Ldr (see above).
5388                let base = self.i64_effective_base(&mut bytes, addr);
5389                bytes.extend_from_slice(&self.encode_thumb32_str(rdlo, &base, offset)?);
5390                bytes.extend_from_slice(&self.encode_thumb32_str(
5391                    rdhi,
5392                    &base,
5393                    offset.wrapping_add(4),
5394                )?);
5395                Ok(bytes)
5396            }
5397
5398            // I64ExtendI32S: MOV rdlo, rn; ASR rdhi, rdlo, #31 (sign-extend)
5399            ArmOp::I64ExtendI32S { rdlo, rdhi, rn } => {
5400                let mut bytes = Vec::new();
5401                if rdlo != rn {
5402                    // MOV rdlo, rn (16-bit)
5403                    bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Mov {
5404                        rd: *rdlo,
5405                        op2: Operand2::Reg(*rn),
5406                    })?);
5407                }
5408                // ASR rdhi, rdlo, #31 (sign-extend: fill high word with sign bit)
5409                bytes.extend_from_slice(
5410                    &self.encode_thumb32_shift(rdhi, rdlo, 31, 0b10)?, // ASR type
5411                );
5412                Ok(bytes)
5413            }
5414
5415            // I64ExtendI32U: MOV rdlo, rn; MOV rdhi, #0
5416            ArmOp::I64ExtendI32U { rdlo, rdhi, rn } => {
5417                let mut bytes = Vec::new();
5418                if rdlo != rn {
5419                    // MOV rdlo, rn
5420                    bytes.extend_from_slice(&self.encode_thumb(&ArmOp::Mov {
5421                        rd: *rdlo,
5422                        op2: Operand2::Reg(*rn),
5423                    })?);
5424                }
5425                // MOV rdhi, #0 (16-bit: MOVS Rd, #0)
5426                let rdhi_bits = reg_to_bits(rdhi) as u16;
5427                let instr: u16 = 0x2000 | (rdhi_bits << 8);
5428                bytes.extend_from_slice(&instr.to_le_bytes());
5429                Ok(bytes)
5430            }
5431
5432            // I32WrapI64: MOV rd, rnlo (just take low 32 bits)
5433            ArmOp::I32WrapI64 { rd, rnlo } => {
5434                if rd == rnlo {
5435                    // No-op: already in the right register
5436                    let instr: u16 = 0xBF00; // NOP
5437                    Ok(instr.to_le_bytes().to_vec())
5438                } else {
5439                    // MOV rd, rnlo
5440                    self.encode_thumb(&ArmOp::Mov {
5441                        rd: *rd,
5442                        op2: Operand2::Reg(*rnlo),
5443                    })
5444                }
5445            }
5446
5447            // ===== Helium MVE operations (Thumb-2 encoding) =====
5448            ArmOp::MveLoad { qd, addr } => Ok(vfp_to_thumb_bytes(encode_mve_vldrw(qd, addr))),
5449            ArmOp::MveStore { qd, addr } => Ok(vfp_to_thumb_bytes(encode_mve_vstrw(qd, addr))),
5450            ArmOp::MveConst { qd, bytes } => self.encode_thumb_mve_const(qd, bytes),
5451            ArmOp::MveAnd { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5452                0xEF000150, qd, qn, qm,
5453            ))),
5454            ArmOp::MveOrr { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5455                0xEF200150, qd, qn, qm,
5456            ))),
5457            ArmOp::MveEor { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5458                0xFF000150, qd, qn, qm,
5459            ))),
5460            ArmOp::MveMvn { qd, qm } => {
5461                // VMVN Qd, Qm: 0xFFB005C0 | Qd<<12 | Qm
5462                let qd_enc = qreg_to_num(qd);
5463                let qm_enc = qreg_to_num(qm);
5464                let instr: u32 = 0xFFB005C0 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5465                Ok(vfp_to_thumb_bytes(instr))
5466            }
5467            ArmOp::MveBic { qd, qn, qm } => Ok(vfp_to_thumb_bytes(encode_mve_3reg_bitwise(
5468                0xEF100150, qd, qn, qm,
5469            ))),
5470            ArmOp::MveAddI { qd, qn, qm, size } => {
5471                let sz = mve_size_bits(size);
5472                let base: u32 = 0xEF000840 | (sz << 20);
5473                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5474            }
5475            ArmOp::MveSubI { qd, qn, qm, size } => {
5476                let sz = mve_size_bits(size);
5477                let base: u32 = 0xFF000840 | (sz << 20);
5478                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5479            }
5480            ArmOp::MveMulI { qd, qn, qm, size } => {
5481                let sz = mve_size_bits(size);
5482                let base: u32 = 0xEF000950 | (sz << 20);
5483                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5484            }
5485            ArmOp::MveNegI { qd, qm, size } => {
5486                let sz = mve_size_bits(size);
5487                // VNEG.Sx Qd, Qm
5488                let qd_enc = qreg_to_num(qd);
5489                let qm_enc = qreg_to_num(qm);
5490                let base: u32 = 0xFFB103C0 | (sz << 18);
5491                let instr = base | ((qd_enc * 2) << 12) | (qm_enc * 2);
5492                Ok(vfp_to_thumb_bytes(instr))
5493            }
5494            ArmOp::MveDup { qd, rn, size } => {
5495                let sz = mve_size_bits(size);
5496                let qd_enc = qreg_to_num(qd);
5497                let rn_bits = reg_to_bits(rn);
5498                // VDUP.sz Qd, Rn: EEA0 0B10 variant
5499                // size encoding: 00=32, 01=16, 10=8
5500                let be = match sz {
5501                    0 => 0b00u32, // 8-bit
5502                    1 => 0b01,    // 16-bit
5503                    _ => 0b00,    // 32-bit (default)
5504                };
5505                let instr: u32 = 0xEEA00B10 | ((qd_enc * 2) << 16) | (rn_bits << 12) | (be << 5);
5506                Ok(vfp_to_thumb_bytes(instr))
5507            }
5508            ArmOp::MveExtractLane { rd, qn, lane, size } => {
5509                let qn_enc = qreg_to_num(qn);
5510                let rd_bits = reg_to_bits(rd);
5511                // VMOV.sz Rd, Dn[x] — extract from Q-register lane
5512                // For 32-bit: VMOV Rd, Dn — where Dn is the appropriate D-register
5513                let d_reg = qn_enc * 2 + ((*lane as u32) >> 1);
5514                let lane_in_d = (*lane as u32) & 1;
5515                let _sz = mve_size_bits(size);
5516                // VMOV Rd, Dn[x]: EE10 0B10 for 32-bit
5517                let instr: u32 = 0xEE100B10 | (d_reg << 16) | (rd_bits << 12) | (lane_in_d << 21);
5518                Ok(vfp_to_thumb_bytes(instr))
5519            }
5520            ArmOp::MveInsertLane { qd, rn, lane, size } => {
5521                let qd_enc = qreg_to_num(qd);
5522                let rn_bits = reg_to_bits(rn);
5523                let d_reg = qd_enc * 2 + ((*lane as u32) >> 1);
5524                let lane_in_d = (*lane as u32) & 1;
5525                let _sz = mve_size_bits(size);
5526                // VMOV Dn[x], Rn: EE00 0B10 for 32-bit
5527                let instr: u32 = 0xEE000B10 | (d_reg << 16) | (rn_bits << 12) | (lane_in_d << 21);
5528                Ok(vfp_to_thumb_bytes(instr))
5529            }
5530
5531            // MVE float comparisons — emit VCMP + VPSEL sequence (simplified: just VCMP)
5532            ArmOp::MveCmpEqI { qd, qn, qm, size }
5533            | ArmOp::MveCmpNeI { qd, qn, qm, size }
5534            | ArmOp::MveCmpLtS { qd, qn, qm, size }
5535            | ArmOp::MveCmpLtU { qd, qn, qm, size }
5536            | ArmOp::MveCmpGtS { qd, qn, qm, size }
5537            | ArmOp::MveCmpGtU { qd, qn, qm, size }
5538            | ArmOp::MveCmpLeS { qd, qn, qm, size }
5539            | ArmOp::MveCmpLeU { qd, qn, qm, size }
5540            | ArmOp::MveCmpGeS { qd, qn, qm, size }
5541            | ArmOp::MveCmpGeU { qd, qn, qm, size } => {
5542                // Encode as VADD (placeholder encoding — real implementation
5543                // would use VCMP + VPSEL pair)
5544                let sz = mve_size_bits(size);
5545                let base: u32 = 0xEF000840 | (sz << 20);
5546                Ok(vfp_to_thumb_bytes(encode_mve_3reg(base, qd, qn, qm)))
5547            }
5548
5549            // f32x4 MVE arithmetic
5550            ArmOp::MveAddF32 { qd, qn, qm } => {
5551                // VADD.F32 Qd, Qn, Qm (MVE): 0xEF000D40
5552                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF000D40, qd, qn, qm)))
5553            }
5554            ArmOp::MveSubF32 { qd, qn, qm } => {
5555                // VSUB.F32 Qd, Qn, Qm (MVE): 0xEF200D40
5556                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF200D40, qd, qn, qm)))
5557            }
5558            ArmOp::MveMulF32 { qd, qn, qm } => {
5559                // VMUL.F32 Qd, Qn, Qm (MVE): 0xFF000D50
5560                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xFF000D50, qd, qn, qm)))
5561            }
5562            ArmOp::MveNegF32 { qd, qm } => {
5563                let qd_enc = qreg_to_num(qd);
5564                let qm_enc = qreg_to_num(qm);
5565                // VNEG.F32 Qd, Qm: FFB907C0
5566                let instr: u32 = 0xFFB907C0 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5567                Ok(vfp_to_thumb_bytes(instr))
5568            }
5569            ArmOp::MveAbsF32 { qd, qm } => {
5570                let qd_enc = qreg_to_num(qd);
5571                let qm_enc = qreg_to_num(qm);
5572                // VABS.F32 Qd, Qm: FFB90740
5573                let instr: u32 = 0xFFB90740 | ((qd_enc * 2) << 12) | (qm_enc * 2);
5574                Ok(vfp_to_thumb_bytes(instr))
5575            }
5576            ArmOp::MveCmpEqF32 { qd, qn, qm }
5577            | ArmOp::MveCmpNeF32 { qd, qn, qm }
5578            | ArmOp::MveCmpLtF32 { qd, qn, qm }
5579            | ArmOp::MveCmpLeF32 { qd, qn, qm }
5580            | ArmOp::MveCmpGtF32 { qd, qn, qm }
5581            | ArmOp::MveCmpGeF32 { qd, qn, qm } => {
5582                // Placeholder: encode as VADD.F32 (real impl needs VCMP.F32 + VPSEL)
5583                Ok(vfp_to_thumb_bytes(encode_mve_3reg(0xEF000D40, qd, qn, qm)))
5584            }
5585            ArmOp::MveDupF32 { qd, rn } => {
5586                let qd_enc = qreg_to_num(qd);
5587                let rn_bits = reg_to_bits(rn);
5588                // VDUP.32 Qd, Rn (same encoding as integer VDUP.32)
5589                let instr: u32 = 0xEEA00B10 | ((qd_enc * 2) << 16) | (rn_bits << 12);
5590                Ok(vfp_to_thumb_bytes(instr))
5591            }
5592            ArmOp::MveExtractLaneF32 { rd, qn, lane } => {
5593                let qn_enc = qreg_to_num(qn);
5594                let rd_bits = reg_to_bits(rd);
5595                // VMOV Rd, Sn where Sn = Q*4 + lane
5596                let s_num = qn_enc * 4 + (*lane as u32);
5597                let (vn, n) = encode_sreg(s_num);
5598                let instr: u32 = 0xEE100A10 | (vn << 16) | (rd_bits << 12) | (n << 7);
5599                Ok(vfp_to_thumb_bytes(instr))
5600            }
5601            ArmOp::MveReplaceLaneF32 { qd, rn, lane } => {
5602                let qd_enc = qreg_to_num(qd);
5603                let rn_bits = reg_to_bits(rn);
5604                // VMOV Sn, Rn where Sn = Q*4 + lane
5605                let s_num = qd_enc * 4 + (*lane as u32);
5606                let (vn, n) = encode_sreg(s_num);
5607                let instr: u32 = 0xEE000A10 | (vn << 16) | (rn_bits << 12) | (n << 7);
5608                Ok(vfp_to_thumb_bytes(instr))
5609            }
5610            ArmOp::MveDivF32 { qd, qn, qm } => {
5611                // Lane-wise: extract 4 S-regs, VDIV, insert back
5612                self.encode_thumb_mve_lane_wise_f32_binop(qd, qn, qm, 0xEE800A00)
5613            }
5614            ArmOp::MveSqrtF32 { qd, qm } => {
5615                // Lane-wise: extract 4 S-regs, VSQRT, insert back
5616                self.encode_thumb_mve_lane_wise_f32_sqrt(qd, qm)
5617            }
5618
5619            // Catch-all for any remaining ops
5620            _ => {
5621                let instr: u16 = 0xBF00; // NOP
5622                Ok(instr.to_le_bytes().to_vec())
5623            }
5624        }
5625    }
5626
5627    // === Thumb-2 VFP multi-instruction helpers ===
5628
5629    /// Encode F32 comparison as Thumb-2: VCMP.F32 + VMRS + MOVS rd,#0 + IT + MOV rd,#1
5630    fn encode_thumb_f32_compare(
5631        &self,
5632        rd: &Reg,
5633        sn: &VfpReg,
5634        sm: &VfpReg,
5635        cond_code: u32,
5636    ) -> Result<Vec<u8>> {
5637        let mut bytes = Vec::new();
5638        let rd_bits = reg_to_bits(rd);
5639
5640        // VCMP.F32 Sn, Sm
5641        let sn_num = vfp_sreg_to_num(sn)?;
5642        let sm_num = vfp_sreg_to_num(sm)?;
5643        let (vd, d) = encode_sreg(sn_num);
5644        let (vm, m) = encode_sreg(sm_num);
5645        let vcmp = 0xEEB40A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5646        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5647
5648        // VMRS APSR_nzcv, FPSCR: 0xEEF1FA10
5649        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5650
5651        // MOVS Rd, #0 (16-bit): 0010 0 Rd(3) 0000 0000
5652        if rd_bits < 8 {
5653            let movs_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
5654            bytes.extend_from_slice(&movs_zero.to_le_bytes());
5655        } else {
5656            // MOV.W Rd, #0 (32-bit Thumb-2)
5657            let hw1: u16 = 0xF04F;
5658            let hw2: u16 = (rd_bits as u16) << 8;
5659            bytes.extend_from_slice(&hw1.to_le_bytes());
5660            bytes.extend_from_slice(&hw2.to_le_bytes());
5661        }
5662
5663        // IT<cond> — If-Then for conditional MOV
5664        // IT encoding: 1011 1111 cond(4) mask(4)
5665        // mask = 0x8 for single "then" (IT)
5666        let it: u16 = 0xBF00 | ((cond_code as u16) << 4) | 0x8;
5667        bytes.extend_from_slice(&it.to_le_bytes());
5668
5669        // MOV Rd, #1 (16-bit, conditional due to IT): 0010 0 Rd(3) 0000 0001
5670        if rd_bits < 8 {
5671            let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
5672            bytes.extend_from_slice(&mov_one.to_le_bytes());
5673        } else {
5674            // MOV.W Rd, #1 (32-bit)
5675            let hw1: u16 = 0xF04F;
5676            let hw2: u16 = ((rd_bits as u16) << 8) | 0x01;
5677            bytes.extend_from_slice(&hw1.to_le_bytes());
5678            bytes.extend_from_slice(&hw2.to_le_bytes());
5679        }
5680
5681        Ok(bytes)
5682    }
5683
5684    /// Encode F32 constant load as Thumb-2: MOVW + MOVT + VMOV
5685    fn encode_thumb_f32_const(&self, sd: &VfpReg, value: f32) -> Result<Vec<u8>> {
5686        let mut bytes = Vec::new();
5687        let bits = value.to_bits();
5688        let rt: u32 = 12; // R12/IP as temp
5689
5690        // MOVW R12, #lo16
5691        // Thumb-2 MOVW: 11110 i 10 0100 imm4 | 0 imm3 Rd imm8
5692        let lo16 = bits & 0xFFFF;
5693        let imm4 = (lo16 >> 12) & 0xF;
5694        let i_bit = (lo16 >> 11) & 1;
5695        let imm3 = (lo16 >> 8) & 0x7;
5696        let imm8 = lo16 & 0xFF;
5697        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
5698        let hw2: u16 = ((imm3 << 12) | (rt << 8) | imm8) as u16;
5699        bytes.extend_from_slice(&hw1.to_le_bytes());
5700        bytes.extend_from_slice(&hw2.to_le_bytes());
5701
5702        // MOVT R12, #hi16
5703        let hi16 = (bits >> 16) & 0xFFFF;
5704        let imm4 = (hi16 >> 12) & 0xF;
5705        let i_bit = (hi16 >> 11) & 1;
5706        let imm3 = (hi16 >> 8) & 0x7;
5707        let imm8 = hi16 & 0xFF;
5708        let hw1: u16 = (0xF2C0 | (i_bit << 10) | imm4) as u16;
5709        let hw2: u16 = ((imm3 << 12) | (rt << 8) | imm8) as u16;
5710        bytes.extend_from_slice(&hw1.to_le_bytes());
5711        bytes.extend_from_slice(&hw2.to_le_bytes());
5712
5713        // VMOV Sd, R12
5714        let vmov = encode_vmov_core_sreg(true, sd, &Reg::R12)?;
5715        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5716
5717        Ok(bytes)
5718    }
5719
5720    /// Encode VMOV + VCVT.F32.xS32 as Thumb-2
5721    fn encode_thumb_f32_convert_i32(&self, sd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
5722        let mut bytes = Vec::new();
5723
5724        // VMOV Sd, Rm
5725        let vmov = encode_vmov_core_sreg(true, sd, rm)?;
5726        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5727
5728        // VCVT.F32.S32/U32 Sd, Sd
5729        let sd_num = vfp_sreg_to_num(sd)?;
5730        let (vd, d) = encode_sreg(sd_num);
5731        let (vm, m) = encode_sreg(sd_num);
5732        let base = if signed { 0xEEB80A40 } else { 0xEEB80AC0 };
5733        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
5734        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5735
5736        Ok(bytes)
5737    }
5738
5739    /// Encode F32 rounding pseudo-op as Thumb-2 via VCVT to integer and back
5740    /// Encode F32 rounding as Thumb-2.
5741    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
5742    ///
5743    /// For trunc: uses VCVTR.S32.F32 (always truncates).
5744    /// For ceil/floor/nearest: sets FPSCR rounding mode, uses VCVT.S32.F32 (non-R variant),
5745    /// then restores FPSCR.
5746    fn encode_thumb_f32_rounding(&self, sd: &VfpReg, sm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
5747        let mut bytes = Vec::new();
5748        let sm_num = vfp_sreg_to_num(sm)?;
5749        let sd_num = vfp_sreg_to_num(sd)?;
5750        let (vd_s, d_s) = encode_sreg(sd_num);
5751        let (vm_s, m_s) = encode_sreg(sm_num);
5752
5753        if mode == 0b11 {
5754            // Trunc (toward zero): VCVTR.S32.F32 — bit[7]=1, always truncates
5755            let vcvt_to_int = 0xEEBD0AC0 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
5756            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5757        } else {
5758            // ceil/floor/nearest: manipulate FPSCR rounding mode
5759            let rt: u32 = 12; // R12/IP as temp
5760
5761            // VMRS R12, FPSCR
5762            let vmrs = 0xEEF10A10 | (rt << 12);
5763            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5764
5765            // BIC.W R12, R12, #(3 << 22) — clear RMode bits [23:22]
5766            // Thumb-2 modified immediate for 3<<22 = 0x00C00000:
5767            // BIC.W encoding: 11110 i 0 0001 S Rn | 0 imm3 Rd imm8
5768            // 0x00C00000 = 0x03 shifted left by 22 => Thumb mod-imm: i=0, imm3=0b101, imm8=0x03
5769            let bic_hw1: u16 = 0xF020 | ((rt as u16) & 0xF); // BIC, Rn=R12
5770            let bic_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | 0x03;
5771            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5772            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5773
5774            // ORR.W R12, R12, #(mode << 22)
5775            if mode != 0 {
5776                let orr_hw1: u16 = 0xF040 | ((rt as u16) & 0xF); // ORR, Rn=R12
5777                let orr_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | (mode as u16);
5778                bytes.extend_from_slice(&orr_hw1.to_le_bytes());
5779                bytes.extend_from_slice(&orr_hw2.to_le_bytes());
5780            }
5781
5782            // VMSR FPSCR, R12
5783            let vmsr = 0xEEE10A10 | (rt << 12);
5784            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5785
5786            // VCVT.S32.F32 Sd, Sm — non-R variant (bit[7]=0), uses FPSCR rmode
5787            let vcvt_to_int = 0xEEBD0A40 | (d_s << 22) | (vd_s << 12) | (m_s << 5) | vm_s;
5788            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
5789
5790            // Restore FPSCR: clear rmode bits back to nearest (default)
5791            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
5792            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
5793            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
5794            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
5795        }
5796
5797        // VCVT.F32.S32 Sd, Sd (convert integer result back to float)
5798        let (vd2, d2) = encode_sreg(sd_num);
5799        let vcvt_to_float = 0xEEB80A40 | (d2 << 22) | (vd2 << 12) | (d_s << 5) | vd_s;
5800        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_float));
5801
5802        Ok(bytes)
5803    }
5804
5805    /// Encode F32 min/max as Thumb-2: VMOV + VCMP + VMRS + IT + VMOV
5806    fn encode_thumb_f32_minmax(
5807        &self,
5808        sd: &VfpReg,
5809        sn: &VfpReg,
5810        sm: &VfpReg,
5811        is_min: bool,
5812    ) -> Result<Vec<u8>> {
5813        let mut bytes = Vec::new();
5814        let sn_num = vfp_sreg_to_num(sn)?;
5815        let sm_num = vfp_sreg_to_num(sm)?;
5816        let sd_num = vfp_sreg_to_num(sd)?;
5817
5818        // VMOV.F32 Sd, Sn
5819        let (vd, d) = encode_sreg(sd_num);
5820        let (vn, n) = encode_sreg(sn_num);
5821        let vmov_sn = 0xEEB00A40 | (d << 22) | (vd << 12) | (n << 5) | vn;
5822        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_sn));
5823
5824        // VCMP.F32 Sn, Sm
5825        let (vm, m) = encode_sreg(sm_num);
5826        let vcmp = 0xEEB40A40 | (n << 22) | (vn << 12) | (m << 5) | vm;
5827        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5828
5829        // VMRS APSR_nzcv, FPSCR
5830        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5831
5832        // IT GT (for min) or IT MI (for max)
5833        let cond: u16 = if is_min { 0xC } else { 0x4 };
5834        let it: u16 = 0xBF00 | (cond << 4) | 0x8;
5835        bytes.extend_from_slice(&it.to_le_bytes());
5836
5837        // VMOV{cond}.F32 Sd, Sm — conditional VMOV in IT block
5838        let vmov_sm = 0xEEB00A40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5839        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_sm));
5840
5841        Ok(bytes)
5842    }
5843
5844    /// Encode F32 copysign as Thumb-2
5845    fn encode_thumb_f32_copysign(&self, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
5846        let mut bytes = Vec::new();
5847
5848        // VMOV R12, Sm (get sign source bits)
5849        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5850            false,
5851            sm,
5852            &Reg::R12,
5853        )?));
5854
5855        // VMOV R0, Sn (get magnitude source bits)
5856        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5857            false,
5858            sn,
5859            &Reg::R0,
5860        )?));
5861
5862        // AND.W R12, R12, #0x80000000
5863        // Thumb-2 modified immediate: 0x80000000 = constant 0x80 with rotation
5864        // Using T1 encoding: 11110 i 0 0000 S Rn | 0 imm3 Rd imm8
5865        // 0x80000000: i=0, imm3=0b001, imm8=0x00 (rotation=4, value=0x80)
5866        // Actually encoding #0x80000000 as modified constant:
5867        // bit pattern 1 followed by 31 zeros: enc = 0b0100_00000000 = 0x0100? No.
5868        // ARM modified immediate: abcdefgh rotated. 0x80000000 = 0x80 ROR 2 = enc 0x0102
5869        // Actually: value = abcdefgh ROR (2*rot). 0x80 = 10000000, ROR 2 gives 0x20000000.
5870        // For 0x80000000: 0x02 ROR 2 = 0x80000000. So imm12 = (1<<8) | 0x02 = 0x102
5871        let hw1: u16 = 0xF000 | 12; // AND.W R12, R12, #modified_const (i=0, Rn=R12)
5872        let hw2: u16 = (0x1 << 12) | (12 << 8) | 0x02; // imm3=1, Rd=R12, imm8=0x02
5873        bytes.extend_from_slice(&hw1.to_le_bytes());
5874        bytes.extend_from_slice(&hw2.to_le_bytes());
5875
5876        // BIC.W R0, R0, #0x80000000 (R0 = register 0, fields are zero)
5877        let hw1: u16 = 0xF020; // BIC.W R0, R0, #modified_const (i=0, Rn=R0)
5878        let hw2: u16 = (0x1 << 12) | 0x02; // imm3=1, Rd=R0, imm8=0x02
5879        bytes.extend_from_slice(&hw1.to_le_bytes());
5880        bytes.extend_from_slice(&hw2.to_le_bytes());
5881
5882        // ORR.W R0, R0, R12 (R0 = register 0)
5883        let hw1: u16 = 0xEA40; // ORR.W R0, R0, R12 (Rn=R0)
5884        let hw2: u16 = 12; // Rd=R0, Rm=R12
5885        bytes.extend_from_slice(&hw1.to_le_bytes());
5886        bytes.extend_from_slice(&hw2.to_le_bytes());
5887
5888        // VMOV Sd, R0
5889        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_sreg(
5890            true,
5891            sd,
5892            &Reg::R0,
5893        )?));
5894
5895        Ok(bytes)
5896    }
5897
5898    /// Encode F64 comparison as Thumb-2: VCMP.F64 + VMRS + MOV #0 + IT + MOV #1
5899    fn encode_thumb_f64_compare(
5900        &self,
5901        rd: &Reg,
5902        dn: &VfpReg,
5903        dm: &VfpReg,
5904        cond_code: u32,
5905    ) -> Result<Vec<u8>> {
5906        let mut bytes = Vec::new();
5907        let rd_bits = reg_to_bits(rd);
5908
5909        // VCMP.F64 Dn, Dm
5910        let dn_num = vfp_dreg_to_num(dn)?;
5911        let dm_num = vfp_dreg_to_num(dm)?;
5912        let (vd, d) = encode_dreg(dn_num);
5913        let (vm, m) = encode_dreg(dm_num);
5914        let vcmp = 0xEEB40B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
5915        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
5916
5917        // VMRS APSR_nzcv, FPSCR
5918        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
5919
5920        // MOVS Rd, #0
5921        if rd_bits < 8 {
5922            let movs_zero: u16 = 0x2000 | ((rd_bits as u16) << 8);
5923            bytes.extend_from_slice(&movs_zero.to_le_bytes());
5924        } else {
5925            let hw1: u16 = 0xF04F;
5926            let hw2: u16 = (rd_bits as u16) << 8;
5927            bytes.extend_from_slice(&hw1.to_le_bytes());
5928            bytes.extend_from_slice(&hw2.to_le_bytes());
5929        }
5930
5931        // IT<cond>
5932        let it: u16 = 0xBF00 | ((cond_code as u16) << 4) | 0x8;
5933        bytes.extend_from_slice(&it.to_le_bytes());
5934
5935        // MOV Rd, #1
5936        if rd_bits < 8 {
5937            let mov_one: u16 = 0x2001 | ((rd_bits as u16) << 8);
5938            bytes.extend_from_slice(&mov_one.to_le_bytes());
5939        } else {
5940            let hw1: u16 = 0xF04F;
5941            let hw2: u16 = ((rd_bits as u16) << 8) | 0x01;
5942            bytes.extend_from_slice(&hw1.to_le_bytes());
5943            bytes.extend_from_slice(&hw2.to_le_bytes());
5944        }
5945
5946        Ok(bytes)
5947    }
5948
5949    /// Encode F64 constant load as Thumb-2: MOVW+MOVT (lo32 into R0) + MOVW+MOVT (hi32 into R12) + VMOV Dd, R0, R12
5950    fn encode_thumb_f64_const(&self, dd: &VfpReg, value: f64) -> Result<Vec<u8>> {
5951        let mut bytes = Vec::new();
5952        let bits = value.to_bits();
5953        let lo32 = bits as u32;
5954        let hi32 = (bits >> 32) as u32;
5955
5956        // MOVW R0, #lo16(lo32)
5957        let lo16 = lo32 & 0xFFFF;
5958        bytes.extend_from_slice(&self.encode_thumb32_movw_raw(0, lo16)?);
5959
5960        // MOVT R0, #hi16(lo32)
5961        let hi16 = (lo32 >> 16) & 0xFFFF;
5962        bytes.extend_from_slice(&self.encode_thumb32_movt_raw(0, hi16)?);
5963
5964        // MOVW R12, #lo16(hi32)
5965        let lo16 = hi32 & 0xFFFF;
5966        bytes.extend_from_slice(&self.encode_thumb32_movw_raw(12, lo16)?);
5967
5968        // MOVT R12, #hi16(hi32)
5969        let hi16 = (hi32 >> 16) & 0xFFFF;
5970        bytes.extend_from_slice(&self.encode_thumb32_movt_raw(12, hi16)?);
5971
5972        // VMOV Dd, R0, R12
5973        let vmov = encode_vmov_core_dreg(true, dd, &Reg::R0, &Reg::R12)?;
5974        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5975
5976        Ok(bytes)
5977    }
5978
5979    /// Encode VMOV Sd, Rm + VCVT.F64.S32/U32 Dd, Sd as Thumb-2
5980    fn encode_thumb_f64_convert_i32(&self, dd: &VfpReg, rm: &Reg, signed: bool) -> Result<Vec<u8>> {
5981        let mut bytes = Vec::new();
5982
5983        // VMOV S0, Rm
5984        let vmov = encode_vmov_core_sreg(true, &VfpReg::S0, rm)?;
5985        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
5986
5987        // VCVT.F64.S32 Dd, S0 or VCVT.F64.U32 Dd, S0
5988        let dd_num = vfp_dreg_to_num(dd)?;
5989        let (vd, d) = encode_dreg(dd_num);
5990        let base = if signed { 0xEEB80B40 } else { 0xEEB80BC0 };
5991        let vcvt = base | (d << 22) | (vd << 12);
5992        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
5993
5994        Ok(bytes)
5995    }
5996
5997    /// Encode VCVT.F64.F32 Dd, Sm as Thumb-2
5998    fn encode_thumb_f64_promote_f32(&self, dd: &VfpReg, sm: &VfpReg) -> Result<Vec<u8>> {
5999        let dd_num = vfp_dreg_to_num(dd)?;
6000        let sm_num = vfp_sreg_to_num(sm)?;
6001        let (vd, d) = encode_dreg(dd_num);
6002        let (vm, m) = encode_sreg(sm_num);
6003
6004        let vcvt = 0xEEB70AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
6005        Ok(vfp_to_thumb_bytes(vcvt))
6006    }
6007
6008    /// Encode VCVT.S32/U32.F64 S0, Dm + VMOV Rd, S0 as Thumb-2
6009    fn encode_thumb_i32_trunc_f64(&self, rd: &Reg, dm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
6010        let mut bytes = Vec::new();
6011        let dm_num = vfp_dreg_to_num(dm)?;
6012        let (vm, m) = encode_dreg(dm_num);
6013
6014        // VCVT.S32.F64 S0, Dm or VCVT.U32.F64 S0, Dm
6015        let base = if signed { 0xEEBD0BC0 } else { 0xEEBC0BC0 };
6016        let vcvt = base | (m << 5) | vm;
6017        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
6018
6019        // VMOV Rd, S0
6020        let vmov = encode_vmov_core_sreg(false, &VfpReg::S0, rd)?;
6021        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
6022
6023        Ok(bytes)
6024    }
6025
6026    /// Encode F64 rounding pseudo-op as Thumb-2 via VCVT to integer and back
6027    /// Encode F64 rounding as Thumb-2.
6028    /// `mode`: FPSCR RMode — 0b00=nearest, 0b01=+inf(ceil), 0b10=-inf(floor), 0b11=zero(trunc)
6029    fn encode_thumb_f64_rounding(&self, dd: &VfpReg, dm: &VfpReg, mode: u8) -> Result<Vec<u8>> {
6030        let mut bytes = Vec::new();
6031        let dm_num = vfp_dreg_to_num(dm)?;
6032        let dd_num = vfp_dreg_to_num(dd)?;
6033        let (vm, m) = encode_dreg(dm_num);
6034        let (vd, d) = encode_dreg(dd_num);
6035
6036        if mode == 0b11 {
6037            // Trunc: VCVTR.S32.F64 — bit[7]=1, always truncates
6038            let vcvt_to_int = 0xEEBD0BC0 | (m << 5) | vm;
6039            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
6040        } else {
6041            let rt: u32 = 12;
6042
6043            // VMRS R12, FPSCR
6044            let vmrs = 0xEEF10A10 | (rt << 12);
6045            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
6046
6047            // BIC.W R12, R12, #(3 << 22)
6048            let bic_hw1: u16 = 0xF020 | ((rt as u16) & 0xF);
6049            let bic_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | 0x03;
6050            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
6051            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
6052
6053            // ORR.W R12, R12, #(mode << 22)
6054            if mode != 0 {
6055                let orr_hw1: u16 = 0xF040 | ((rt as u16) & 0xF);
6056                let orr_hw2: u16 = (0x05 << 12) | ((rt as u16) << 8) | (mode as u16);
6057                bytes.extend_from_slice(&orr_hw1.to_le_bytes());
6058                bytes.extend_from_slice(&orr_hw2.to_le_bytes());
6059            }
6060
6061            // VMSR FPSCR, R12
6062            let vmsr = 0xEEE10A10 | (rt << 12);
6063            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
6064
6065            // VCVT.S32.F64 S0, Dm — non-R variant (bit[7]=0)
6066            let vcvt_to_int = 0xEEBD0B40 | (m << 5) | vm;
6067            bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_int));
6068
6069            // Restore FPSCR
6070            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmrs));
6071            bytes.extend_from_slice(&bic_hw1.to_le_bytes());
6072            bytes.extend_from_slice(&bic_hw2.to_le_bytes());
6073            bytes.extend_from_slice(&vfp_to_thumb_bytes(vmsr));
6074        }
6075
6076        // VCVT.F64.S32 Dd, S0
6077        let vcvt_to_float = 0xEEB80B40 | (d << 22) | (vd << 12);
6078        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt_to_float));
6079
6080        Ok(bytes)
6081    }
6082
6083    /// Encode F64 min/max as Thumb-2
6084    fn encode_thumb_f64_minmax(
6085        &self,
6086        dd: &VfpReg,
6087        dn: &VfpReg,
6088        dm: &VfpReg,
6089        is_min: bool,
6090    ) -> Result<Vec<u8>> {
6091        let mut bytes = Vec::new();
6092        let dn_num = vfp_dreg_to_num(dn)?;
6093        let dm_num = vfp_dreg_to_num(dm)?;
6094        let dd_num = vfp_dreg_to_num(dd)?;
6095
6096        // VMOV.F64 Dd, Dn
6097        let (vd, d) = encode_dreg(dd_num);
6098        let (vn, n) = encode_dreg(dn_num);
6099        let vmov_dn = 0xEEB00B40 | (d << 22) | (vd << 12) | (n << 5) | vn;
6100        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_dn));
6101
6102        // VCMP.F64 Dn, Dm
6103        let (vm, m) = encode_dreg(dm_num);
6104        let vcmp = 0xEEB40B40 | (n << 22) | (vn << 12) | (m << 5) | vm;
6105        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcmp));
6106
6107        // VMRS APSR_nzcv, FPSCR
6108        bytes.extend_from_slice(&vfp_to_thumb_bytes(0xEEF1FA10));
6109
6110        // IT GT (for min) or IT MI (for max)
6111        let cond: u16 = if is_min { 0xC } else { 0x4 };
6112        let it: u16 = 0xBF00 | (cond << 4) | 0x8;
6113        bytes.extend_from_slice(&it.to_le_bytes());
6114
6115        // VMOV{cond}.F64 Dd, Dm
6116        let vmov_dm = 0xEEB00B40 | (d << 22) | (vd << 12) | (m << 5) | vm;
6117        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov_dm));
6118
6119        Ok(bytes)
6120    }
6121
6122    /// Encode F64 copysign as Thumb-2
6123    fn encode_thumb_f64_copysign(&self, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<Vec<u8>> {
6124        let mut bytes = Vec::new();
6125
6126        // VMOV R0, R12, Dm (get sign source)
6127        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
6128            false,
6129            dm,
6130            &Reg::R0,
6131            &Reg::R12,
6132        )?));
6133
6134        // VMOV R1, R2, Dn (get magnitude source)
6135        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
6136            false,
6137            dn,
6138            &Reg::R1,
6139            &Reg::R2,
6140        )?));
6141
6142        // AND.W R12, R12, #0x80000000 (i=0, Rn=R12)
6143        let hw1: u16 = 0xF000 | 12;
6144        let hw2: u16 = (0x1 << 12) | (12 << 8) | 0x02;
6145        bytes.extend_from_slice(&hw1.to_le_bytes());
6146        bytes.extend_from_slice(&hw2.to_le_bytes());
6147
6148        // BIC.W R2, R2, #0x80000000 (i=0, Rn=R2)
6149        let hw1: u16 = 0xF020 | 2;
6150        let hw2: u16 = (0x1 << 12) | (2 << 8) | 0x02;
6151        bytes.extend_from_slice(&hw1.to_le_bytes());
6152        bytes.extend_from_slice(&hw2.to_le_bytes());
6153
6154        // ORR.W R2, R2, R12
6155        let hw1: u16 = 0xEA40 | 2;
6156        let hw2: u16 = (2 << 8) | 12;
6157        bytes.extend_from_slice(&hw1.to_le_bytes());
6158        bytes.extend_from_slice(&hw2.to_le_bytes());
6159
6160        // VMOV Dd, R1, R2
6161        bytes.extend_from_slice(&vfp_to_thumb_bytes(encode_vmov_core_dreg(
6162            true,
6163            dd,
6164            &Reg::R1,
6165            &Reg::R2,
6166        )?));
6167
6168        Ok(bytes)
6169    }
6170
6171    /// Encode VCVT.S32/U32.F32 + VMOV as Thumb-2
6172    fn encode_thumb_i32_trunc_f32(&self, rd: &Reg, sm: &VfpReg, signed: bool) -> Result<Vec<u8>> {
6173        let mut bytes = Vec::new();
6174
6175        let sm_num = vfp_sreg_to_num(sm)?;
6176        let (vd, d) = encode_sreg(sm_num);
6177        let (vm, m) = encode_sreg(sm_num);
6178        let base = if signed { 0xEEBD0AC0 } else { 0xEEBC0AC0 };
6179        let vcvt = base | (d << 22) | (vd << 12) | (m << 5) | vm;
6180        bytes.extend_from_slice(&vfp_to_thumb_bytes(vcvt));
6181
6182        // VMOV Rd, Sm
6183        let vmov = encode_vmov_core_sreg(false, sm, rd)?;
6184        bytes.extend_from_slice(&vfp_to_thumb_bytes(vmov));
6185
6186        Ok(bytes)
6187    }
6188
6189    // === Thumb-2 32-bit encoding helpers ===
6190
6191    /// Encode Thumb-2 32-bit ADD with immediate
6192    fn encode_thumb32_add(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6193        let rd_bits = reg_to_bits(rd);
6194        let rn_bits = reg_to_bits(rn);
6195
6196        // The `i:imm3:imm8` field is split the same way for both forms.
6197        let i_bit = (imm >> 11) & 1;
6198        let imm3 = (imm >> 8) & 0x7;
6199        let imm8 = imm & 0xFF;
6200
6201        let hw1_base = if imm <= 0xFF {
6202            // ADD.W (T3): the field is a ThumbExpandImm modified immediate. For
6203            // imm <= 0xFF (i:imm3 = 0000) it is the zero-extended byte, which is
6204            // correct — keep this form so existing encodings stay bit-identical.
6205            0xF100
6206        } else if imm <= 0xFFF {
6207            // ADDW (T4): a PLAIN 12-bit immediate (0..4095) — no ThumbExpandImm.
6208            // This is what makes `add sp, sp, #frame` correct for frame sizes
6209            // >= 256, which ADD.W (T3) would silently mis-encode (e.g. #256 -> #0).
6210            0xF200
6211        } else {
6212            return Err(synth_core::Error::synthesis(
6213                "ADD immediate > 0xFFF (4095) requires a multi-instruction sequence (not supported)",
6214            ));
6215        };
6216
6217        let hw1: u16 = (hw1_base | (i_bit << 10) | rn_bits) as u16;
6218        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6219
6220        let mut bytes = hw1.to_le_bytes().to_vec();
6221        bytes.extend_from_slice(&hw2.to_le_bytes());
6222        Ok(bytes)
6223    }
6224
6225    /// Encode Thumb-2 32-bit SUB with immediate
6226    fn encode_thumb32_sub(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6227        let rd_bits = reg_to_bits(rd);
6228        let rn_bits = reg_to_bits(rn);
6229
6230        let i_bit = (imm >> 11) & 1;
6231        let imm3 = (imm >> 8) & 0x7;
6232        let imm8 = imm & 0xFF;
6233
6234        let hw1_base = if imm <= 0xFF {
6235            // SUB.W (T3) modified immediate — correct for the zero-extended byte
6236            // (imm <= 0xFF). Kept bit-identical for existing encodings.
6237            0xF1A0
6238        } else if imm <= 0xFFF {
6239            // SUBW (T4): plain 12-bit immediate (0..4095). Makes
6240            // `sub sp, sp, #frame` correct for frame sizes >= 256.
6241            0xF2A0
6242        } else {
6243            return Err(synth_core::Error::synthesis(
6244                "SUB immediate > 0xFFF (4095) requires a multi-instruction sequence (not supported)",
6245            ));
6246        };
6247
6248        let hw1: u16 = (hw1_base | (i_bit << 10) | rn_bits) as u16;
6249        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6250
6251        let mut bytes = hw1.to_le_bytes().to_vec();
6252        bytes.extend_from_slice(&hw2.to_le_bytes());
6253        Ok(bytes)
6254    }
6255
6256    /// Encode Thumb-2 32-bit ADDS with immediate (sets flags)
6257    fn encode_thumb32_adds(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6258        let rd_bits = reg_to_bits(rd);
6259        let rn_bits = reg_to_bits(rn);
6260
6261        // ADDS.W (flag-setting) has only the modified-immediate form — error on
6262        // an un-encodable value rather than silently add the wrong constant.
6263        let field = try_thumb_expand_imm(imm).ok_or_else(|| {
6264            synth_core::Error::synthesis(
6265                "ADDS immediate is not a valid ThumbExpandImm — materialize into a register",
6266            )
6267        })?;
6268        let i_bit = (field >> 11) & 1;
6269        let imm3 = (field >> 8) & 0x7;
6270        let imm8 = field & 0xFF;
6271
6272        // ADDS.W Rd, Rn, #imm (with S=1)
6273        // First halfword: 1111 0 i 0 1000 1 Rn = F110 | i<<10 | Rn
6274        let hw1: u16 = (0xF110 | (i_bit << 10) | rn_bits) as u16;
6275        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6276
6277        let mut bytes = hw1.to_le_bytes().to_vec();
6278        bytes.extend_from_slice(&hw2.to_le_bytes());
6279        Ok(bytes)
6280    }
6281
6282    /// Encode Thumb-2 32-bit SUBS with immediate (sets flags)
6283    fn encode_thumb32_subs(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6284        let rd_bits = reg_to_bits(rd);
6285        let rn_bits = reg_to_bits(rn);
6286
6287        // SUBS.W (flag-setting) has only the modified-immediate form — error on
6288        // an un-encodable value rather than silently subtract the wrong constant.
6289        let field = try_thumb_expand_imm(imm).ok_or_else(|| {
6290            synth_core::Error::synthesis(
6291                "SUBS immediate is not a valid ThumbExpandImm — materialize into a register",
6292            )
6293        })?;
6294        let i_bit = (field >> 11) & 1;
6295        let imm3 = (field >> 8) & 0x7;
6296        let imm8 = field & 0xFF;
6297
6298        // SUBS.W Rd, Rn, #imm (with S=1)
6299        // First halfword: 1111 0 i 0 1101 1 Rn = F1B0 | i<<10 | Rn
6300        let hw1: u16 = (0xF1B0 | (i_bit << 10) | rn_bits) as u16;
6301        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6302
6303        let mut bytes = hw1.to_le_bytes().to_vec();
6304        bytes.extend_from_slice(&hw2.to_le_bytes());
6305        Ok(bytes)
6306    }
6307
6308    /// Encode Thumb-2 32-bit MOVW (16-bit immediate)
6309    ///
6310    /// # Contract (Verus-style)
6311    /// ```text
6312    /// requires rd <= R14
6313    /// ensures result.len() == 4
6314    /// ensures (imm & 0xFFFF) can be reconstructed from the encoding
6315    /// ```
6316    fn encode_thumb32_movw(&self, rd: &Reg, imm: u32) -> Result<Vec<u8>> {
6317        let rd_bits = reg_to_bits(rd);
6318        reg_bits_checked(rd_bits)?;
6319        let imm16 = imm & 0xFFFF;
6320
6321        // MOVW Rd, #imm16
6322        // 1111 0 i 10 0 1 0 0 imm4 | 0 imm3 Rd imm8
6323        let imm4 = (imm16 >> 12) & 0xF;
6324        let i_bit = (imm16 >> 11) & 1;
6325        let imm3 = (imm16 >> 8) & 0x7;
6326        let imm8 = imm16 & 0xFF;
6327
6328        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
6329        let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6330
6331        let mut bytes = hw1.to_le_bytes().to_vec();
6332        bytes.extend_from_slice(&hw2.to_le_bytes());
6333        encoding_contracts::verify_thumb32(&bytes);
6334        Ok(bytes)
6335    }
6336
6337    /// Encode Thumb-2 32-bit shift with immediate
6338    ///
6339    /// # Contract (Verus-style)
6340    /// ```text
6341    /// requires rd <= R14, rm <= R14
6342    /// ensures result.len() == 4
6343    /// ```
6344    fn encode_thumb32_shift(
6345        &self,
6346        rd: &Reg,
6347        rm: &Reg,
6348        shift: u32,
6349        shift_type: u8,
6350    ) -> Result<Vec<u8>> {
6351        let rd_bits = reg_to_bits(rd);
6352        let rm_bits = reg_to_bits(rm);
6353        reg_bits_checked(rd_bits)?;
6354        reg_bits_checked(rm_bits)?;
6355        let imm5 = shift & 0x1F;
6356        let imm2 = imm5 & 0x3;
6357        let imm3 = (imm5 >> 2) & 0x7;
6358
6359        // MOV.W Rd, Rm, <shift> #imm
6360        // EA4F 0 imm3 Rd imm2 type Rm
6361        let hw1: u16 = 0xEA4F;
6362        let hw2: u16 =
6363            ((imm3 << 12) | (rd_bits << 8) | (imm2 << 6) | ((shift_type as u32) << 4) | rm_bits)
6364                as u16;
6365
6366        let mut bytes = hw1.to_le_bytes().to_vec();
6367        bytes.extend_from_slice(&hw2.to_le_bytes());
6368        Ok(bytes)
6369    }
6370
6371    /// Encode Thumb-2 32-bit shift by register
6372    /// Encoding: 11111010 0xx0 Rn | 1111 Rd 0000 Rm
6373    /// shift_type: 00=LSL, 01=LSR, 10=ASR, 11=ROR
6374    fn encode_thumb32_shift_reg(
6375        &self,
6376        rd: &Reg,
6377        rn: &Reg,
6378        rm: &Reg,
6379        shift_type: u8,
6380    ) -> Result<Vec<u8>> {
6381        let rd_bits = reg_to_bits(rd);
6382        let rn_bits = reg_to_bits(rn);
6383        let rm_bits = reg_to_bits(rm);
6384
6385        // hw1: 1111 1010 0xx0 Rn
6386        let hw1: u16 = (0xFA00 | ((shift_type as u32) << 5) | rn_bits) as u16;
6387        // hw2: 1111 Rd 0000 Rm
6388        let hw2: u16 = (0xF000 | (rd_bits << 8) | rm_bits) as u16;
6389
6390        let mut bytes = hw1.to_le_bytes().to_vec();
6391        bytes.extend_from_slice(&hw2.to_le_bytes());
6392        Ok(bytes)
6393    }
6394
6395    /// Encode Thumb-2 32-bit CMP with immediate
6396    fn encode_thumb32_cmp_imm(&self, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6397        let rn_bits = reg_to_bits(rn);
6398
6399        // CMP.W has only the modified-immediate form (no plain-imm12 like ADDW),
6400        // so an un-encodable immediate MUST be materialized into a register by
6401        // the selector. Error rather than silently compare the wrong constant.
6402        let field = try_thumb_expand_imm(imm).ok_or_else(|| {
6403            synth_core::Error::synthesis(
6404                "CMP immediate is not a valid ThumbExpandImm — materialize into a register",
6405            )
6406        })?;
6407        let i_bit = (field >> 11) & 1;
6408        let imm3 = (field >> 8) & 0x7;
6409        let imm8 = field & 0xFF;
6410
6411        // CMP.W Rn, #imm
6412        let hw1: u16 = (0xF1B0 | (i_bit << 10) | rn_bits) as u16;
6413        let hw2: u16 = ((imm3 << 12) | 0x0F00 | imm8) as u16;
6414
6415        let mut bytes = hw1.to_le_bytes().to_vec();
6416        bytes.extend_from_slice(&hw2.to_le_bytes());
6417        Ok(bytes)
6418    }
6419
6420    /// #372: resolve the base register for an `I64Ldr`/`I64Str` whose address
6421    /// may carry an index register. If `addr.offset_reg` is set (a memory
6422    /// `i64.load`/`i64.store`: `R11 + addr + offset`), emit `ADD.W ip, base,
6423    /// index` and return `ip` (R12) as the base for the two immediate-offset
6424    /// halves. If unset (a frame access at `[base, #off]`), return `addr.base`
6425    /// unchanged — emitting nothing — so non-indexed i64 access is byte-identical.
6426    /// `ip = base + index` is computed BEFORE the halves load, so an `rdlo`
6427    /// aliasing the index register is safe (the address is already materialized).
6428    fn i64_effective_base(&self, bytes: &mut Vec<u8>, addr: &MemAddr) -> Reg {
6429        match addr.offset_reg {
6430            Some(idx) => {
6431                let ip = Reg::R12;
6432                // ADD.W ip, addr.base, idx  (Thumb-2, byte-verified vs as)
6433                let hw1: u16 = 0xEB00 | reg_to_bits(&addr.base) as u16;
6434                let hw2: u16 = 0x0C00 | reg_to_bits(&idx) as u16;
6435                bytes.extend_from_slice(&hw1.to_le_bytes());
6436                bytes.extend_from_slice(&hw2.to_le_bytes());
6437                ip
6438            }
6439            None => addr.base,
6440        }
6441    }
6442
6443    /// Encode Thumb-2 32-bit LDR
6444    fn encode_thumb32_ldr(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6445        let rd_bits = reg_to_bits(rd);
6446        let base_bits = reg_to_bits(base);
6447
6448        // LDR.W Rd, [Rn, #imm12]
6449        check_ldst_imm12(offset)?;
6450        let hw1: u16 = (0xF8D0 | base_bits) as u16;
6451        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6452
6453        let mut bytes = hw1.to_le_bytes().to_vec();
6454        bytes.extend_from_slice(&hw2.to_le_bytes());
6455        Ok(bytes)
6456    }
6457
6458    /// Encode Thumb-2 32-bit STR
6459    fn encode_thumb32_str(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6460        let rd_bits = reg_to_bits(rd);
6461        let base_bits = reg_to_bits(base);
6462
6463        // STR.W Rd, [Rn, #imm12]
6464        check_ldst_imm12(offset)?;
6465        let hw1: u16 = (0xF8C0 | base_bits) as u16;
6466        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6467
6468        let mut bytes = hw1.to_le_bytes().to_vec();
6469        bytes.extend_from_slice(&hw2.to_le_bytes());
6470        Ok(bytes)
6471    }
6472
6473    /// Encode Thumb-2 32-bit LDR with register offset: LDR.W Rd, [Rn, Rm]
6474    fn encode_thumb32_ldr_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6475        let rd_bits = reg_to_bits(rd);
6476        let base_bits = reg_to_bits(base);
6477        let rm_bits = reg_to_bits(offset_reg);
6478
6479        // LDR.W Rd, [Rn, Rm, LSL #0]
6480        // Encoding: 1111 1000 0101 Rn | Rt 0000 00 imm2 Rm
6481        // imm2 = 00 for no shift (LSL #0)
6482        let hw1: u16 = (0xF850 | base_bits) as u16;
6483        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6484
6485        let mut bytes = hw1.to_le_bytes().to_vec();
6486        bytes.extend_from_slice(&hw2.to_le_bytes());
6487        Ok(bytes)
6488    }
6489
6490    /// Encode Thumb-2 32-bit STR with register offset: STR.W Rd, [Rn, Rm]
6491    fn encode_thumb32_str_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6492        let rd_bits = reg_to_bits(rd);
6493        let base_bits = reg_to_bits(base);
6494        let rm_bits = reg_to_bits(offset_reg);
6495
6496        // STR.W Rd, [Rn, Rm, LSL #0]
6497        // Encoding: 1111 1000 0100 Rn | Rt 0000 00 imm2 Rm
6498        // imm2 = 00 for no shift (LSL #0)
6499        let hw1: u16 = (0xF840 | base_bits) as u16;
6500        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6501
6502        let mut bytes = hw1.to_le_bytes().to_vec();
6503        bytes.extend_from_slice(&hw2.to_le_bytes());
6504        Ok(bytes)
6505    }
6506
6507    // === Sub-word load/store Thumb-2 encoding helpers ===
6508
6509    /// Encode Thumb-2 32-bit LDRB with immediate: LDRB.W Rd, [Rn, #imm12]
6510    fn encode_thumb32_ldrb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6511        let rd_bits = reg_to_bits(rd);
6512        let base_bits = reg_to_bits(base);
6513        // LDRB.W Rd, [Rn, #imm12]: 1111 1000 1001 Rn | Rt imm12
6514        check_ldst_imm12(offset)?;
6515        let hw1: u16 = (0xF890 | base_bits) as u16;
6516        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6517        let mut bytes = hw1.to_le_bytes().to_vec();
6518        bytes.extend_from_slice(&hw2.to_le_bytes());
6519        Ok(bytes)
6520    }
6521
6522    /// Encode Thumb-2 32-bit LDRB with register: LDRB.W Rd, [Rn, Rm]
6523    fn encode_thumb32_ldrb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6524        let rd_bits = reg_to_bits(rd);
6525        let base_bits = reg_to_bits(base);
6526        let rm_bits = reg_to_bits(offset_reg);
6527        // LDRB.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0001 Rn | Rt 0000 00 imm2 Rm
6528        let hw1: u16 = (0xF810 | base_bits) as u16;
6529        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6530        let mut bytes = hw1.to_le_bytes().to_vec();
6531        bytes.extend_from_slice(&hw2.to_le_bytes());
6532        Ok(bytes)
6533    }
6534
6535    /// Encode Thumb-2 32-bit LDRSB with immediate: LDRSB.W Rd, [Rn, #imm12]
6536    fn encode_thumb32_ldrsb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6537        let rd_bits = reg_to_bits(rd);
6538        let base_bits = reg_to_bits(base);
6539        // LDRSB.W Rd, [Rn, #imm12]: 1111 1001 1001 Rn | Rt imm12
6540        check_ldst_imm12(offset)?;
6541        let hw1: u16 = (0xF990 | base_bits) as u16;
6542        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6543        let mut bytes = hw1.to_le_bytes().to_vec();
6544        bytes.extend_from_slice(&hw2.to_le_bytes());
6545        Ok(bytes)
6546    }
6547
6548    /// Encode Thumb-2 32-bit LDRSB with register: LDRSB.W Rd, [Rn, Rm]
6549    fn encode_thumb32_ldrsb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6550        let rd_bits = reg_to_bits(rd);
6551        let base_bits = reg_to_bits(base);
6552        let rm_bits = reg_to_bits(offset_reg);
6553        // LDRSB.W Rd, [Rn, Rm, LSL #0]: 1111 1001 0001 Rn | Rt 0000 00 imm2 Rm
6554        let hw1: u16 = (0xF910 | base_bits) as u16;
6555        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6556        let mut bytes = hw1.to_le_bytes().to_vec();
6557        bytes.extend_from_slice(&hw2.to_le_bytes());
6558        Ok(bytes)
6559    }
6560
6561    /// Encode Thumb-2 32-bit LDRH with immediate: LDRH.W Rd, [Rn, #imm12]
6562    fn encode_thumb32_ldrh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6563        let rd_bits = reg_to_bits(rd);
6564        let base_bits = reg_to_bits(base);
6565        // LDRH.W Rd, [Rn, #imm12]: 1111 1000 1011 Rn | Rt imm12
6566        check_ldst_imm12(offset)?;
6567        let hw1: u16 = (0xF8B0 | base_bits) as u16;
6568        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6569        let mut bytes = hw1.to_le_bytes().to_vec();
6570        bytes.extend_from_slice(&hw2.to_le_bytes());
6571        Ok(bytes)
6572    }
6573
6574    /// Encode Thumb-2 32-bit LDRH with register: LDRH.W Rd, [Rn, Rm]
6575    fn encode_thumb32_ldrh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6576        let rd_bits = reg_to_bits(rd);
6577        let base_bits = reg_to_bits(base);
6578        let rm_bits = reg_to_bits(offset_reg);
6579        // LDRH.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0011 Rn | Rt 0000 00 imm2 Rm
6580        let hw1: u16 = (0xF830 | base_bits) as u16;
6581        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6582        let mut bytes = hw1.to_le_bytes().to_vec();
6583        bytes.extend_from_slice(&hw2.to_le_bytes());
6584        Ok(bytes)
6585    }
6586
6587    /// Encode Thumb-2 32-bit LDRSH with immediate: LDRSH.W Rd, [Rn, #imm12]
6588    fn encode_thumb32_ldrsh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6589        let rd_bits = reg_to_bits(rd);
6590        let base_bits = reg_to_bits(base);
6591        // LDRSH.W Rd, [Rn, #imm12]: 1111 1001 1011 Rn | Rt imm12
6592        check_ldst_imm12(offset)?;
6593        let hw1: u16 = (0xF9B0 | base_bits) as u16;
6594        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6595        let mut bytes = hw1.to_le_bytes().to_vec();
6596        bytes.extend_from_slice(&hw2.to_le_bytes());
6597        Ok(bytes)
6598    }
6599
6600    /// Encode Thumb-2 32-bit LDRSH with register: LDRSH.W Rd, [Rn, Rm]
6601    fn encode_thumb32_ldrsh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6602        let rd_bits = reg_to_bits(rd);
6603        let base_bits = reg_to_bits(base);
6604        let rm_bits = reg_to_bits(offset_reg);
6605        // LDRSH.W Rd, [Rn, Rm, LSL #0]: 1111 1001 0011 Rn | Rt 0000 00 imm2 Rm
6606        let hw1: u16 = (0xF930 | base_bits) as u16;
6607        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6608        let mut bytes = hw1.to_le_bytes().to_vec();
6609        bytes.extend_from_slice(&hw2.to_le_bytes());
6610        Ok(bytes)
6611    }
6612
6613    /// Encode Thumb-2 32-bit STRB with immediate: STRB.W Rd, [Rn, #imm12]
6614    fn encode_thumb32_strb_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6615        let rd_bits = reg_to_bits(rd);
6616        let base_bits = reg_to_bits(base);
6617        // STRB.W Rd, [Rn, #imm12]: 1111 1000 1000 Rn | Rt imm12
6618        check_ldst_imm12(offset)?;
6619        let hw1: u16 = (0xF880 | base_bits) as u16;
6620        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6621        let mut bytes = hw1.to_le_bytes().to_vec();
6622        bytes.extend_from_slice(&hw2.to_le_bytes());
6623        Ok(bytes)
6624    }
6625
6626    /// Encode Thumb-2 32-bit STRB with register: STRB.W Rd, [Rn, Rm]
6627    fn encode_thumb32_strb_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6628        let rd_bits = reg_to_bits(rd);
6629        let base_bits = reg_to_bits(base);
6630        let rm_bits = reg_to_bits(offset_reg);
6631        // STRB.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0000 Rn | Rt 0000 00 imm2 Rm
6632        let hw1: u16 = (0xF800 | base_bits) as u16;
6633        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6634        let mut bytes = hw1.to_le_bytes().to_vec();
6635        bytes.extend_from_slice(&hw2.to_le_bytes());
6636        Ok(bytes)
6637    }
6638
6639    /// Encode Thumb-2 32-bit STRH with immediate: STRH.W Rd, [Rn, #imm12]
6640    fn encode_thumb32_strh_imm(&self, rd: &Reg, base: &Reg, offset: u32) -> Result<Vec<u8>> {
6641        let rd_bits = reg_to_bits(rd);
6642        let base_bits = reg_to_bits(base);
6643        // STRH.W Rd, [Rn, #imm12]: 1111 1000 1010 Rn | Rt imm12
6644        check_ldst_imm12(offset)?;
6645        let hw1: u16 = (0xF8A0 | base_bits) as u16;
6646        let hw2: u16 = ((rd_bits << 12) | (offset & 0xFFF)) as u16;
6647        let mut bytes = hw1.to_le_bytes().to_vec();
6648        bytes.extend_from_slice(&hw2.to_le_bytes());
6649        Ok(bytes)
6650    }
6651
6652    /// Encode Thumb-2 32-bit STRH with register: STRH.W Rd, [Rn, Rm]
6653    fn encode_thumb32_strh_reg(&self, rd: &Reg, base: &Reg, offset_reg: &Reg) -> Result<Vec<u8>> {
6654        let rd_bits = reg_to_bits(rd);
6655        let base_bits = reg_to_bits(base);
6656        let rm_bits = reg_to_bits(offset_reg);
6657        // STRH.W Rd, [Rn, Rm, LSL #0]: 1111 1000 0010 Rn | Rt 0000 00 imm2 Rm
6658        let hw1: u16 = (0xF820 | base_bits) as u16;
6659        let hw2: u16 = ((rd_bits << 12) | rm_bits) as u16;
6660        let mut bytes = hw1.to_le_bytes().to_vec();
6661        bytes.extend_from_slice(&hw2.to_le_bytes());
6662        Ok(bytes)
6663    }
6664
6665    /// Encode Thumb-2 32-bit ADD with immediate: ADD.W Rd, Rn, #imm
6666    fn encode_thumb32_add_imm(&self, rd: &Reg, rn: &Reg, imm: u32) -> Result<Vec<u8>> {
6667        let rd_bits = reg_to_bits(rd);
6668        let rn_bits = reg_to_bits(rn);
6669
6670        // For small immediates, use ADD.W Rd, Rn, #imm12
6671        // Encoding: 1111 0 i 0 1 0 0 0 S Rn | 0 imm3 Rd imm8
6672        // S = 0 (don't update flags)
6673        // The 12-bit immediate is encoded as: i:imm3:imm8
6674        // For simplicity, we only support imm <= 0xFFF (direct encoding)
6675        if imm <= 0xFFF {
6676            let i_bit = (imm >> 11) & 1;
6677            let imm3 = (imm >> 8) & 0x7;
6678            let imm8 = imm & 0xFF;
6679
6680            let hw1: u16 = (0xF100 | (i_bit << 10) | rn_bits) as u16;
6681            let hw2: u16 = ((imm3 << 12) | (rd_bits << 8) | imm8) as u16;
6682
6683            let mut bytes = hw1.to_le_bytes().to_vec();
6684            bytes.extend_from_slice(&hw2.to_le_bytes());
6685            Ok(bytes)
6686        } else {
6687            // Out-of-range immediate (> 0xFFF): materialize it into a scratch
6688            // register, then ADD.W Rd, Rn, scratch. This is the #180/#185
6689            // "encoder must produce a legal sequence, not assert" class — see #350.
6690            //
6691            // Scratch choice (must NEVER equal Rn, or Rn would be clobbered before
6692            // the ADD reads it):
6693            //   - rd != rn  => use rd itself (rn is untouched, since rd != rn).
6694            //   - rd == rn  => use R12/IP (the reserved encoder scratch). rd/rn are
6695            //                  never R12 (R12 is non-allocatable), so it can't alias.
6696            //
6697            // The materialized value is the same whether or not MOVT is emitted, so
6698            // the byte length depends only on `imm` (and rd==rn) — the size probe and
6699            // the final emit therefore agree (mandatory: the function is encoded twice).
6700            let scratch: u32 = if rd_bits == rn_bits {
6701                12 // R12/IP — in-place add, can't use rd because rd == rn
6702            } else {
6703                rd_bits // rn is preserved because rd != rn
6704            };
6705            // Invariant: the scratch must never alias Rn (would clobber it before
6706            // the ADD reads it). Unreachable in real codegen (rd/rn are never R12,
6707            // which is reserved encoder scratch), but the encoder is also driven by
6708            // the `encoder_no_panic` fuzz harness with ARBITRARY registers — incl.
6709            // rd==rn==R12, which makes scratch (R12) alias Rn. The encoder contract
6710            // (#180/#185) is Ok-or-Err, never a panic, so return a typed error
6711            // instead of asserting. #350 follow-up.
6712            if scratch == rn_bits {
6713                return Err(synth_core::Error::synthesis(format!(
6714                    "ADD #imm: cannot lower #{imm:#x} for Rd==Rn==R12 — no free scratch \
6715                     register (R12 is the reserved encoder scratch and aliases Rn here)"
6716                )));
6717            }
6718
6719            let lo16 = imm & 0xFFFF;
6720            let hi16 = (imm >> 16) & 0xFFFF;
6721
6722            let mut bytes = self.encode_thumb32_movw_raw(scratch, lo16)?;
6723            if hi16 != 0 {
6724                bytes.extend_from_slice(&self.encode_thumb32_movt_raw(scratch, hi16)?);
6725            }
6726            bytes.extend_from_slice(&self.encode_thumb32_add_reg_raw(rd_bits, rn_bits, scratch)?);
6727            Ok(bytes)
6728        }
6729    }
6730
6731    // === Raw encoding helpers for POPCNT (take register numbers directly) ===
6732
6733    /// Encode Thumb-2 32-bit MOVW (16-bit immediate) - raw version
6734    ///
6735    /// # Contract (Verus-style)
6736    /// ```text
6737    /// requires rd <= 14, imm16 <= 0xFFFF
6738    /// ensures result.len() == 4
6739    /// ```
6740    fn encode_thumb32_movw_raw(&self, rd: u32, imm16: u32) -> Result<Vec<u8>> {
6741        reg_bits_checked(rd)?;
6742        encoding_contracts::verify_imm16(imm16);
6743        // MOVW Rd, #imm16
6744        // 1111 0 i 10 0 1 0 0 imm4 | 0 imm3 Rd imm8
6745        let imm16 = imm16 & 0xFFFF;
6746        let imm4 = (imm16 >> 12) & 0xF;
6747        let i_bit = (imm16 >> 11) & 1;
6748        let imm3 = (imm16 >> 8) & 0x7;
6749        let imm8 = imm16 & 0xFF;
6750
6751        let hw1: u16 = (0xF240 | (i_bit << 10) | imm4) as u16;
6752        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6753
6754        let mut bytes = hw1.to_le_bytes().to_vec();
6755        bytes.extend_from_slice(&hw2.to_le_bytes());
6756        encoding_contracts::verify_thumb32(&bytes);
6757        Ok(bytes)
6758    }
6759
6760    /// Encode Thumb-2 32-bit MOVT (move top 16 bits) - raw version
6761    ///
6762    /// # Contract (Verus-style)
6763    /// ```text
6764    /// requires rd <= 14, imm16 <= 0xFFFF
6765    /// ensures result.len() == 4
6766    /// ```
6767    fn encode_thumb32_movt_raw(&self, rd: u32, imm16: u32) -> Result<Vec<u8>> {
6768        reg_bits_checked(rd)?;
6769        encoding_contracts::verify_imm16(imm16);
6770        // MOVT Rd, #imm16
6771        // 1111 0 i 10 1 1 0 0 imm4 | 0 imm3 Rd imm8
6772        let imm16 = imm16 & 0xFFFF;
6773        let imm4 = (imm16 >> 12) & 0xF;
6774        let i_bit = (imm16 >> 11) & 1;
6775        let imm3 = (imm16 >> 8) & 0x7;
6776        let imm8 = imm16 & 0xFF;
6777
6778        let hw1: u16 = (0xF2C0 | (i_bit << 10) | imm4) as u16;
6779        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6780
6781        let mut bytes = hw1.to_le_bytes().to_vec();
6782        bytes.extend_from_slice(&hw2.to_le_bytes());
6783        encoding_contracts::verify_thumb32(&bytes);
6784        Ok(bytes)
6785    }
6786
6787    /// Encode Thumb-2 32-bit LSR (logical shift right) with immediate - raw version
6788    fn encode_thumb32_lsr_raw(&self, rd: u32, rm: u32, shift: u32) -> Result<Vec<u8>> {
6789        // MOV.W Rd, Rm, LSR #imm
6790        // EA4F 0 imm3 Rd imm2 01 Rm
6791        let imm5 = shift & 0x1F;
6792        let imm2 = imm5 & 0x3;
6793        let imm3 = (imm5 >> 2) & 0x7;
6794
6795        let hw1: u16 = 0xEA4F;
6796        let hw2: u16 = ((imm3 << 12) | (rd << 8) | (imm2 << 6) | (0b01 << 4) | rm) as u16;
6797
6798        let mut bytes = hw1.to_le_bytes().to_vec();
6799        bytes.extend_from_slice(&hw2.to_le_bytes());
6800        Ok(bytes)
6801    }
6802
6803    /// Encode Thumb-2 32-bit AND (register) - raw version
6804    fn encode_thumb32_and_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6805        // AND.W Rd, Rn, Rm
6806        // EA00 Rn | 0 Rd 00 00 Rm
6807        let hw1: u16 = (0xEA00 | rn) as u16;
6808        let hw2: u16 = ((rd << 8) | rm) as u16;
6809
6810        let mut bytes = hw1.to_le_bytes().to_vec();
6811        bytes.extend_from_slice(&hw2.to_le_bytes());
6812        Ok(bytes)
6813    }
6814
6815    /// Encode Thumb-2 32-bit AND with immediate - raw version
6816    fn encode_thumb32_and_imm_raw(&self, rd: u32, rn: u32, imm: u32) -> Result<Vec<u8>> {
6817        // AND.W Rd, Rn, #<modified_immediate>
6818        // For small immediates (0-255), the encoding is simpler
6819        // F0 00 Rn | 0 imm3 Rd imm8
6820        let i_bit = (imm >> 11) & 1;
6821        let imm3 = (imm >> 8) & 0x7;
6822        let imm8 = imm & 0xFF;
6823
6824        let hw1: u16 = (0xF000 | (i_bit << 10) | rn) as u16;
6825        let hw2: u16 = ((imm3 << 12) | (rd << 8) | imm8) as u16;
6826
6827        let mut bytes = hw1.to_le_bytes().to_vec();
6828        bytes.extend_from_slice(&hw2.to_le_bytes());
6829        Ok(bytes)
6830    }
6831
6832    /// Encode Thumb-2 32-bit SUB (register) - raw version
6833    fn encode_thumb32_sub_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6834        // SUB.W Rd, Rn, Rm
6835        // EBA0 Rn | 0 Rd 00 00 Rm
6836        let hw1: u16 = (0xEBA0 | rn) as u16;
6837        let hw2: u16 = ((rd << 8) | rm) as u16;
6838
6839        let mut bytes = hw1.to_le_bytes().to_vec();
6840        bytes.extend_from_slice(&hw2.to_le_bytes());
6841        Ok(bytes)
6842    }
6843
6844    /// Encode Thumb-2 32-bit ADD (register) - raw version
6845    fn encode_thumb32_add_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6846        // ADD.W Rd, Rn, Rm
6847        // EB00 Rn | 0 Rd 00 00 Rm
6848        let hw1: u16 = (0xEB00 | rn) as u16;
6849        let hw2: u16 = ((rd << 8) | rm) as u16;
6850
6851        let mut bytes = hw1.to_le_bytes().to_vec();
6852        bytes.extend_from_slice(&hw2.to_le_bytes());
6853        Ok(bytes)
6854    }
6855
6856    /// Encode Thumb-2 32-bit ADDS (register, flag-setting) - raw version.
6857    /// Used as the high-register fallback for `ArmOp::Adds` (i64 low-word add)
6858    /// so R8-R11 pair operands don't overflow the 16-bit field — #178/#180.
6859    fn encode_thumb32_adds_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6860        // ADDS.W Rd, Rn, Rm (T3, S=1): EB10 Rn | 0 Rd 00 00 Rm
6861        let hw1: u16 = (0xEB10 | rn) as u16;
6862        let hw2: u16 = ((rd << 8) | rm) as u16;
6863        let mut bytes = hw1.to_le_bytes().to_vec();
6864        bytes.extend_from_slice(&hw2.to_le_bytes());
6865        Ok(bytes)
6866    }
6867
6868    /// Encode Thumb-2 32-bit SUBS (register, flag-setting) - raw version.
6869    /// High-register fallback for `ArmOp::Subs` (i64 low-word subtract) — #178/#180.
6870    fn encode_thumb32_subs_reg_raw(&self, rd: u32, rn: u32, rm: u32) -> Result<Vec<u8>> {
6871        // SUBS.W Rd, Rn, Rm (T3, S=1): EBB0 Rn | 0 Rd 00 00 Rm
6872        let hw1: u16 = (0xEBB0 | rn) as u16;
6873        let hw2: u16 = ((rd << 8) | rm) as u16;
6874        let mut bytes = hw1.to_le_bytes().to_vec();
6875        bytes.extend_from_slice(&hw2.to_le_bytes());
6876        Ok(bytes)
6877    }
6878
6879    /// Encode a sequence of ARM instructions
6880    pub fn encode_sequence(&self, ops: &[ArmOp]) -> Result<Vec<u8>> {
6881        let mut code = Vec::new();
6882
6883        for op in ops {
6884            let encoded = self.encode(op)?;
6885            code.extend_from_slice(&encoded);
6886        }
6887
6888        Ok(code)
6889    }
6890}
6891
/// Reverse of the ARMv7-M `ThumbExpandImm`: given a 32-bit immediate, return the
/// 12-bit `i:imm3:imm8` field if it is a representable modified immediate, else
/// `None` (the caller must materialize the value into a register). This is the
/// shared correct path for the data-processing immediate encoders — without it
/// they pack raw bits and silently mis-encode any value `> 0xFF` that isn't a
/// modified immediate (the silent-miscompile class behind #251/#253/#255).
///
/// Forms are tried in this order: plain 8-bit value, `0x00XY00XY`,
/// `0xXY00XY00`, `0xXYXYXYXY`, then an 8-bit value with its top bit set rotated
/// right by 8..=31 (smallest rotation first).
fn try_thumb_expand_imm(value: u32) -> Option<u32> {
    // i:imm3 = 0000 → the value is the zero-extended imm8 itself.
    if value <= 0xFF {
        return Some(value);
    }

    let low = value & 0xFF; // byte 0
    let second = (value >> 8) & 0xFF; // byte 1

    // Replicated-byte forms. Multiplying an 8-bit value by the pattern below
    // copies it into the listed byte lanes (no lane overlaps, no overflow).
    // i:imm3 = 0001 → 0x00XY00XY
    if value == low * 0x0001_0001 {
        return Some(0x100 | low);
    }
    // i:imm3 = 0010 → 0xXY00XY00
    if value == second * 0x0100_0100 {
        return Some(0x200 | second);
    }
    // i:imm3 = 0011 → 0xXYXYXYXY
    if value == low * 0x0101_0101 {
        return Some(0x300 | low);
    }

    // Rotated form: rotating left by `rot` undoes an encoded right rotation.
    // If that leaves `1bbbbbbb` (0x80..=0xFF), the value is representable with
    // imm12[11:7] = rot and imm12[6:0] = the low seven bits.
    (8..=31u32).find_map(|rot| {
        let unrotated = value.rotate_left(rot);
        (0x80..=0xFF)
            .contains(&unrotated)
            .then(|| (rot << 7) | (unrotated & 0x7F))
    })
}
6929
6930/// Guard a Thumb-2 `LDR/STR Rd, [Rn, #imm12]` offset. The imm12 form supports
6931/// `0..=4095`; a larger offset must be materialized into a register by the
6932/// selector (register-offset addressing). Returning `Err` rather than silently
6933/// masking `offset & 0xFFF` closes the wrong-address miscompile class (#259,
6934/// the load/store sibling of #253/#255).
6935fn check_ldst_imm12(offset: u32) -> Result<()> {
6936    if offset > 0xFFF {
6937        Err(synth_core::Error::synthesis(
6938            "load/store immediate offset > 0xFFF (4095) — materialize the offset into a register",
6939        ))
6940    } else {
6941        Ok(())
6942    }
6943}
6944
6945fn reg_to_bits(reg: &Reg) -> u32 {
6946    match reg {
6947        Reg::R0 => 0,
6948        Reg::R1 => 1,
6949        Reg::R2 => 2,
6950        Reg::R3 => 3,
6951        Reg::R4 => 4,
6952        Reg::R5 => 5,
6953        Reg::R6 => 6,
6954        Reg::R7 => 7,
6955        Reg::R8 => 8,
6956        Reg::R9 => 9,
6957        Reg::R10 => 10,
6958        Reg::R11 => 11,
6959        Reg::R12 => 12,
6960        Reg::SP => 13,
6961        Reg::LR => 14,
6962        Reg::PC => 15,
6963    }
6964}
6965
6966/// Fallible form of the `verify_reg_bits` contract. PC (R15) is not a valid
6967/// data operand for the Thumb-2 encodings that use this guard (SDIV/UDIV/MLS/…
6968/// are UNPREDICTABLE with PC). Synth's own codegen never emits PC there, but
6969/// the encoder must stay *total* over arbitrary `ArmOp` inputs — the fuzz
6970/// harness (`encoder_no_panic`) requires Ok-or-Err, never a panic. Pre-fix, the
6971/// `debug_assert` in `verify_reg_bits` aborted under `-Cdebug-assertions`.
6972/// Returns a typed Err instead. See #185.
6973fn reg_bits_checked(bits: u32) -> Result<()> {
6974    if bits > 14 {
6975        return Err(synth_core::Error::synthesis(format!(
6976            "register bits {bits} (PC/R15) is not a valid operand for this Thumb-2 encoding"
6977        )));
6978    }
6979    Ok(())
6980}
6981
/// Try to encode a 32-bit value as an ARM rotated immediate (imm8 ROR 2*rot4).
///
/// Returns `Some((encoded_bits, 1))` if representable, `None` otherwise. The
/// encoded bits are `rot4 << 8 | imm8`, and the smallest usable rotation is
/// chosen (so `0` encodes as `(0, 1)`).
fn try_encode_rotated_imm(val: u32) -> Option<(u32, u32)> {
    (0..16u32).find_map(|rot4| {
        // Rotating left by 2*rot4 undoes the ROR; a hit leaves only 8 bits.
        let imm8 = val.rotate_left(rot4 * 2);
        (imm8 <= 0xFF).then(|| ((rot4 << 8) | imm8, 1))
    })
}
6999
7000/// Encode operand2 field and return (bits, immediate_flag).
7001/// For ARM32 mode, immediates use the rotated-immediate encoding (imm8 ROR 2*rot4).
7002/// Panics if an immediate value cannot be represented. Callers that need large
7003/// immediates should use MOVW/MOVT instead of Operand2::Imm.
7004fn encode_operand2(op2: &Operand2) -> Result<(u32, u32)> {
7005    match op2 {
7006        Operand2::Imm(val) => {
7007            let uval = *val as u32;
7008            // Attempt rotated-immediate encoding (ARM32 Operand2)
7009            if let Some(encoded) = try_encode_rotated_imm(uval) {
7010                Ok(encoded)
7011            } else {
7012                // #378-class honesty: an immediate that can't be expressed as an
7013                // ARM32 rotated immediate is an INTERNAL selector bug — large
7014                // constants must be materialized via MOVW/MOVT, not passed here.
7015                // FAIL HONESTLY with an Err rather than silently masking to
7016                // `uval & 0xFF` and emitting a WRONG immediate. The encoder is
7017                // Ok-or-Err, never corrupt (#180/#185); a loud Err is also why
7018                // this is an Err and not a panic (the `encoder_no_panic` fuzz
7019                // contract — malformed/oversized input must degrade, not crash).
7020                Err(synth_core::Error::synthesis(format!(
7021                    "encode_operand2: immediate {uval:#x} ({val}) is not an ARM32 \
7022                     rotated immediate — the selector must materialize large \
7023                     constants via MOVW/MOVT"
7024                )))
7025            }
7026        }
7027
7028        Operand2::Reg(reg) => {
7029            let reg_bits = reg_to_bits(reg);
7030            Ok((reg_bits, 0)) // I=0 for register
7031        }
7032
7033        Operand2::RegShift {
7034            rm,
7035            shift: _,
7036            amount,
7037        } => {
7038            // Simplified encoding with shift
7039            let rm_bits = reg_to_bits(rm);
7040            let shift_bits = (*amount & 0x1F) << 7;
7041            Ok((shift_bits | rm_bits, 0))
7042        }
7043    }
7044}
7045
7046/// Encode memory address to (base_reg, offset)
7047fn encode_mem_addr(addr: &MemAddr) -> (u32, u32) {
7048    let base_bits = reg_to_bits(&addr.base);
7049    let offset_bits = (addr.offset as u32) & 0xFFF; // 12-bit offset
7050    (base_bits, offset_bits)
7051}
7052
7053/// S-register number: S0=0, S1=1, ..., S31=31
7054fn vfp_sreg_to_num(reg: &VfpReg) -> Result<u32> {
7055    match reg {
7056        VfpReg::S0 => Ok(0),
7057        VfpReg::S1 => Ok(1),
7058        VfpReg::S2 => Ok(2),
7059        VfpReg::S3 => Ok(3),
7060        VfpReg::S4 => Ok(4),
7061        VfpReg::S5 => Ok(5),
7062        VfpReg::S6 => Ok(6),
7063        VfpReg::S7 => Ok(7),
7064        VfpReg::S8 => Ok(8),
7065        VfpReg::S9 => Ok(9),
7066        VfpReg::S10 => Ok(10),
7067        VfpReg::S11 => Ok(11),
7068        VfpReg::S12 => Ok(12),
7069        VfpReg::S13 => Ok(13),
7070        VfpReg::S14 => Ok(14),
7071        VfpReg::S15 => Ok(15),
7072        VfpReg::S16 => Ok(16),
7073        VfpReg::S17 => Ok(17),
7074        VfpReg::S18 => Ok(18),
7075        VfpReg::S19 => Ok(19),
7076        VfpReg::S20 => Ok(20),
7077        VfpReg::S21 => Ok(21),
7078        VfpReg::S22 => Ok(22),
7079        VfpReg::S23 => Ok(23),
7080        VfpReg::S24 => Ok(24),
7081        VfpReg::S25 => Ok(25),
7082        VfpReg::S26 => Ok(26),
7083        VfpReg::S27 => Ok(27),
7084        VfpReg::S28 => Ok(28),
7085        VfpReg::S29 => Ok(29),
7086        VfpReg::S30 => Ok(30),
7087        VfpReg::S31 => Ok(31),
7088        // D-registers are not used in F32 single-precision encodings
7089        _ => Err(synth_core::Error::SynthesisError(
7090            "D-register not supported in single-precision VFP encoding".to_string(),
7091        )),
7092    }
7093}
7094
7095/// D-register number: D0=0, D1=1, ..., D15=15
7096fn vfp_dreg_to_num(reg: &VfpReg) -> Result<u32> {
7097    match reg {
7098        VfpReg::D0 => Ok(0),
7099        VfpReg::D1 => Ok(1),
7100        VfpReg::D2 => Ok(2),
7101        VfpReg::D3 => Ok(3),
7102        VfpReg::D4 => Ok(4),
7103        VfpReg::D5 => Ok(5),
7104        VfpReg::D6 => Ok(6),
7105        VfpReg::D7 => Ok(7),
7106        VfpReg::D8 => Ok(8),
7107        VfpReg::D9 => Ok(9),
7108        VfpReg::D10 => Ok(10),
7109        VfpReg::D11 => Ok(11),
7110        VfpReg::D12 => Ok(12),
7111        VfpReg::D13 => Ok(13),
7112        VfpReg::D14 => Ok(14),
7113        VfpReg::D15 => Ok(15),
7114        // S-registers are not used in F64 double-precision encodings
7115        _ => Err(synth_core::Error::SynthesisError(
7116            "S-register not supported in double-precision VFP encoding".to_string(),
7117        )),
7118    }
7119}
7120
/// Split S-register into (Vx[3:0], qualifier_bit) for VFP encoding.
/// For an S-register number s the register field is the number halved and the
/// qualifier is its lowest bit (Vx = s >> 1, qualifier = s & 1).
/// The qualifier bit goes to D (bit 22), N (bit 7), or M (bit 5) depending on role.
fn encode_sreg(s: u32) -> (u32, u32) {
    let field = s / 2;
    let qualifier = s % 2;
    (field, qualifier)
}
7127
/// Split D-register into (Vx[3:0], qualifier_bit) for VFP double-precision encoding.
/// For a D-register number d the register field is its low four bits and the
/// qualifier is bit 4 (Vx = d & 0xF, qualifier = (d >> 4) & 1).
/// For D0-D15, qualifier is always 0.
fn encode_dreg(d: u32) -> (u32, u32) {
    let field = d % 16;
    let qualifier = (d / 16) % 2;
    (field, qualifier)
}
7134
7135/// Encode a VFP 3-register arithmetic instruction (VADD.F32, VSUB.F32, VMUL.F32, VDIV.F32).
7136/// Returns the full 32-bit instruction word.
7137///
7138/// VFP encoding: [cond 1110] [D opc1 Vn] [Vd 101 sz] [N opc2 M 0 Vm]
7139/// For single-precision (sz=0), coprocessor = 0xA (bits[11:8]).
7140fn encode_vfp_3reg(base: u32, sd: &VfpReg, sn: &VfpReg, sm: &VfpReg) -> Result<u32> {
7141    let sd_num = vfp_sreg_to_num(sd)?;
7142    let sn_num = vfp_sreg_to_num(sn)?;
7143    let sm_num = vfp_sreg_to_num(sm)?;
7144    let (vd, d) = encode_sreg(sd_num);
7145    let (vn, n) = encode_sreg(sn_num);
7146    let (vm, m) = encode_sreg(sm_num);
7147
7148    Ok(base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm)
7149}
7150
7151/// Encode a VFP 2-register instruction (VNEG.F32, VABS.F32, VSQRT.F32).
7152/// Returns the full 32-bit instruction word.
7153fn encode_vfp_2reg(base: u32, sd: &VfpReg, sm: &VfpReg) -> Result<u32> {
7154    let sd_num = vfp_sreg_to_num(sd)?;
7155    let sm_num = vfp_sreg_to_num(sm)?;
7156    let (vd, d) = encode_sreg(sd_num);
7157    let (vm, m) = encode_sreg(sm_num);
7158
7159    Ok(base | (d << 22) | (vd << 12) | (m << 5) | vm)
7160}
7161
7162/// Encode a VFP load/store (VLDR.F32 / VSTR.F32).
7163/// offset is in bytes and must be word-aligned; encoded as imm8 = offset/4.
7164/// U bit (bit 23) controls add/subtract offset.
7165fn encode_vfp_ldst(base: u32, sd: &VfpReg, addr: &MemAddr) -> Result<u32> {
7166    let sd_num = vfp_sreg_to_num(sd)?;
7167    let (vd, d) = encode_sreg(sd_num);
7168    let rn = reg_to_bits(&addr.base);
7169
7170    let offset = addr.offset;
7171    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
7172    let abs_offset = offset.unsigned_abs();
7173    let imm8 = (abs_offset / 4) & 0xFF;
7174
7175    Ok(base | (u_bit << 23) | (d << 22) | (rn << 16) | (vd << 12) | imm8)
7176}
7177
7178/// Encode VMOV between core register and S-register.
7179/// VMOV Sn, Rt: 0xEE00_0A10 | (Vn << 16) | (N << 7) | (Rt << 12)
7180/// VMOV Rt, Sn: 0xEE10_0A10 | (Vn << 16) | (N << 7) | (Rt << 12)
7181fn encode_vmov_core_sreg(to_sreg: bool, sreg: &VfpReg, core: &Reg) -> Result<u32> {
7182    let s_num = vfp_sreg_to_num(sreg)?;
7183    let (vn, n) = encode_sreg(s_num);
7184    let rt = reg_to_bits(core);
7185
7186    let base = if to_sreg { 0xEE000A10 } else { 0xEE100A10 };
7187    Ok(base | (vn << 16) | (rt << 12) | (n << 7))
7188}
7189
7190/// Encode a VFP 3-register double-precision instruction (VADD.F64, VSUB.F64, etc.).
7191/// For double-precision (sz=1), coprocessor = 0xB (bits[11:8]).
7192/// The base should have bit 8 = 1 for F64 (0xB suffix instead of 0xA).
7193fn encode_vfp_3reg_f64(base: u32, dd: &VfpReg, dn: &VfpReg, dm: &VfpReg) -> Result<u32> {
7194    let dd_num = vfp_dreg_to_num(dd)?;
7195    let dn_num = vfp_dreg_to_num(dn)?;
7196    let dm_num = vfp_dreg_to_num(dm)?;
7197    let (vd, d) = encode_dreg(dd_num);
7198    let (vn, n) = encode_dreg(dn_num);
7199    let (vm, m) = encode_dreg(dm_num);
7200
7201    Ok(base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm)
7202}
7203
7204/// Encode a VFP 2-register double-precision instruction (VNEG.F64, VABS.F64, VSQRT.F64).
7205fn encode_vfp_2reg_f64(base: u32, dd: &VfpReg, dm: &VfpReg) -> Result<u32> {
7206    let dd_num = vfp_dreg_to_num(dd)?;
7207    let dm_num = vfp_dreg_to_num(dm)?;
7208    let (vd, d) = encode_dreg(dd_num);
7209    let (vm, m) = encode_dreg(dm_num);
7210
7211    Ok(base | (d << 22) | (vd << 12) | (m << 5) | vm)
7212}
7213
7214/// Encode a VFP load/store for double-precision (VLDR.64 / VSTR.64).
7215/// offset is in bytes and must be word-aligned; encoded as imm8 = offset/4.
7216fn encode_vfp_ldst_f64(base: u32, dd: &VfpReg, addr: &MemAddr) -> Result<u32> {
7217    let dd_num = vfp_dreg_to_num(dd)?;
7218    let (vd, d) = encode_dreg(dd_num);
7219    let rn = reg_to_bits(&addr.base);
7220
7221    let offset = addr.offset;
7222    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
7223    let abs_offset = offset.unsigned_abs();
7224    let imm8 = (abs_offset / 4) & 0xFF;
7225
7226    Ok(base | (u_bit << 23) | (d << 22) | (rn << 16) | (vd << 12) | imm8)
7227}
7228
7229/// Encode VMOV between two core registers and a D-register.
7230/// VMOV Dm, Rt, Rt2: 0xEC40_0B10 | (Rt2 << 16) | (Rt << 12) | (M << 5) | Vm
7231/// VMOV Rt, Rt2, Dm: 0xEC50_0B10 | (Rt2 << 16) | (Rt << 12) | (M << 5) | Vm
7232fn encode_vmov_core_dreg(
7233    to_dreg: bool,
7234    dreg: &VfpReg,
7235    core_lo: &Reg,
7236    core_hi: &Reg,
7237) -> Result<u32> {
7238    let d_num = vfp_dreg_to_num(dreg)?;
7239    let (vm, m) = encode_dreg(d_num);
7240    let rt = reg_to_bits(core_lo);
7241    let rt2 = reg_to_bits(core_hi);
7242
7243    let base = if to_dreg { 0xEC400B10 } else { 0xEC500B10 };
7244    Ok(base | (rt2 << 16) | (rt << 12) | (m << 5) | vm)
7245}
7246
/// Emit a VFP 32-bit instruction as Thumb-2 bytes (two LE halfwords).
///
/// The upper halfword of `instr` is emitted first, then the lower one, each in
/// little-endian byte order.
fn vfp_to_thumb_bytes(instr: u32) -> Vec<u8> {
    // Swapping the two halfwords and serializing the whole word little-endian
    // yields exactly "high halfword LE, then low halfword LE".
    instr.rotate_left(16).to_le_bytes().to_vec()
}
7255
7256// ============================================================================
7257// Helium MVE encoding helpers
7258// ============================================================================
7259
7260/// Q-register number: Q0=0, Q1=1, ..., Q7=7
7261fn qreg_to_num(reg: &QReg) -> u32 {
7262    match reg {
7263        QReg::Q0 => 0,
7264        QReg::Q1 => 1,
7265        QReg::Q2 => 2,
7266        QReg::Q3 => 3,
7267        QReg::Q4 => 4,
7268        QReg::Q5 => 5,
7269        QReg::Q6 => 6,
7270        QReg::Q7 => 7,
7271    }
7272}
7273
7274/// MVE element size to encoding bits: S8=0b00, S16=0b01, S32=0b10
7275fn mve_size_bits(size: &MveSize) -> u32 {
7276    match size {
7277        MveSize::S8 => 0b00,
7278        MveSize::S16 => 0b01,
7279        MveSize::S32 => 0b10,
7280    }
7281}
7282
7283/// Encode MVE 3-register instruction.
7284/// Q-registers are encoded as D-register pairs: Q0=D0:D1, Q1=D2:D3, etc.
7285/// In NEON/MVE encoding, the Q-register uses D-register number = Qn * 2.
7286fn encode_mve_3reg(base: u32, qd: &QReg, qn: &QReg, qm: &QReg) -> u32 {
7287    let d = qreg_to_num(qd) * 2;
7288    let n = qreg_to_num(qn) * 2;
7289    let m = qreg_to_num(qm) * 2;
7290
7291    // Standard NEON/MVE 3-register encoding:
7292    // D bit (bit 22) = Vd[4], Vd[3:0] = bits [15:12]
7293    // N bit (bit 7)  = Vn[4], Vn[3:0] = bits [19:16]
7294    // M bit (bit 5)  = Vm[4], Vm[3:0] = bits [3:0]
7295    let vd = d & 0xF;
7296    let d_bit = (d >> 4) & 1;
7297    let vn = n & 0xF;
7298    let n_bit = (n >> 4) & 1;
7299    let vm = m & 0xF;
7300    let m_bit = (m >> 4) & 1;
7301
7302    base | (d_bit << 22) | (vn << 16) | (vd << 12) | (n_bit << 7) | (m_bit << 5) | vm
7303}
7304
/// Encode MVE 3-register bitwise instruction (VAND, VORR, VEOR, VBIC).
///
/// Thin wrapper: every argument is forwarded unchanged to `encode_mve_3reg`,
/// so the register fields are packed exactly as that function packs them.
fn encode_mve_3reg_bitwise(base: u32, qd: &QReg, qn: &QReg, qm: &QReg) -> u32 {
    encode_mve_3reg(base, qd, qn, qm)
}
7309
7310/// Encode MVE VLDRW.32 Qd, [Rn, #offset]
7311/// Format: EC9x xxxx - contiguous load, word-sized elements
7312fn encode_mve_vldrw(qd: &QReg, addr: &MemAddr) -> u32 {
7313    let qd_enc = qreg_to_num(qd) * 2;
7314    let rn = reg_to_bits(&addr.base);
7315    let offset = addr.offset;
7316    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
7317    let abs_offset = offset.unsigned_abs();
7318    let imm7 = (abs_offset / 4) & 0x7F; // 7-bit word-aligned offset
7319
7320    // VLDRW.32 Qd, [Rn, #imm]: ED10 xx80 variant
7321    0xED100E80
7322        | (u_bit << 23)
7323        | ((qd_enc >> 4) << 22)
7324        | (rn << 16)
7325        | ((qd_enc & 0xF) << 12)
7326        | (imm7 & 0x7F)
7327}
7328
7329/// Encode MVE VSTRW.32 Qd, [Rn, #offset]
7330fn encode_mve_vstrw(qd: &QReg, addr: &MemAddr) -> u32 {
7331    let qd_enc = qreg_to_num(qd) * 2;
7332    let rn = reg_to_bits(&addr.base);
7333    let offset = addr.offset;
7334    let u_bit = if offset >= 0 { 1u32 } else { 0u32 };
7335    let abs_offset = offset.unsigned_abs();
7336    let imm7 = (abs_offset / 4) & 0x7F;
7337
7338    0xED000E80
7339        | (u_bit << 23)
7340        | ((qd_enc >> 4) << 22)
7341        | (rn << 16)
7342        | ((qd_enc & 0xF) << 12)
7343        | (imm7 & 0x7F)
7344}
7345
7346impl ArmEncoder {
7347    /// Encode MVE constant load: MOVW+MOVT+VMOV for each 32-bit word, then assemble Q-register
7348    fn encode_thumb_mve_const(&self, qd: &QReg, bytes: &[u8; 16]) -> Result<Vec<u8>> {
7349        let mut result = Vec::new();
7350        let qd_num = qreg_to_num(qd);
7351
7352        // Load each 32-bit word into R12 (temp) then VMOV into S-register
7353        for i in 0..4 {
7354            let word = u32::from_le_bytes([
7355                bytes[i * 4],
7356                bytes[i * 4 + 1],
7357                bytes[i * 4 + 2],
7358                bytes[i * 4 + 3],
7359            ]);
7360            let lo16 = word & 0xFFFF;
7361            let hi16 = (word >> 16) & 0xFFFF;
7362
7363            // MOVW R12, #lo16
7364            result.extend_from_slice(&self.encode_thumb32_movw_raw(12, lo16)?);
7365            // MOVT R12, #hi16
7366            if hi16 != 0 {
7367                result.extend_from_slice(&self.encode_thumb32_movt_raw(12, hi16)?);
7368            }
7369
7370            // VMOV Sn, R12 where Sn = Qd*4 + i
7371            let s_num = qd_num * 4 + i as u32;
7372            let (vn, n) = encode_sreg(s_num);
7373            let vmov: u32 = 0xEE000A10 | (vn << 16) | (12 << 12) | (n << 7);
7374            result.extend_from_slice(&vfp_to_thumb_bytes(vmov));
7375        }
7376
7377        Ok(result)
7378    }
7379
7380    /// Encode lane-wise f32 binary operation (VDIV, etc.) via S-register extraction
7381    fn encode_thumb_mve_lane_wise_f32_binop(
7382        &self,
7383        qd: &QReg,
7384        qn: &QReg,
7385        qm: &QReg,
7386        vfp_base: u32,
7387    ) -> Result<Vec<u8>> {
7388        let mut result = Vec::new();
7389        let qd_num = qreg_to_num(qd);
7390        let qn_num = qreg_to_num(qn);
7391        let qm_num = qreg_to_num(qm);
7392
7393        // For each lane 0..3: use S-registers directly (Q aliasing)
7394        for i in 0..4u32 {
7395            let sd = qd_num * 4 + i;
7396            let sn = qn_num * 4 + i;
7397            let sm = qm_num * 4 + i;
7398
7399            let (vd, d) = encode_sreg(sd);
7400            let (vn, n) = encode_sreg(sn);
7401            let (vm, m) = encode_sreg(sm);
7402
7403            let instr = vfp_base | (d << 22) | (vn << 16) | (vd << 12) | (n << 7) | (m << 5) | vm;
7404            result.extend_from_slice(&vfp_to_thumb_bytes(instr));
7405        }
7406
7407        Ok(result)
7408    }
7409
7410    /// Encode lane-wise f32 VSQRT via S-register extraction
7411    fn encode_thumb_mve_lane_wise_f32_sqrt(&self, qd: &QReg, qm: &QReg) -> Result<Vec<u8>> {
7412        let mut result = Vec::new();
7413        let qd_num = qreg_to_num(qd);
7414        let qm_num = qreg_to_num(qm);
7415
7416        // VSQRT.F32 base: 0xEEB10AC0
7417        for i in 0..4u32 {
7418            let sd = qd_num * 4 + i;
7419            let sm = qm_num * 4 + i;
7420
7421            let (vd, d) = encode_sreg(sd);
7422            let (vm, m) = encode_sreg(sm);
7423
7424            let instr: u32 = 0xEEB10AC0 | (d << 22) | (vd << 12) | (m << 5) | vm;
7425            result.extend_from_slice(&vfp_to_thumb_bytes(instr));
7426        }
7427
7428        Ok(result)
7429    }
7430}
7431
7432#[cfg(test)]
7433mod tests {
7434    use super::*;
7435
7436    #[test]
7437    fn test_encoder_creation() {
7438        let encoder_arm = ArmEncoder::new_arm32();
7439        assert!(!encoder_arm.thumb_mode);
7440
7441        let encoder_thumb = ArmEncoder::new_thumb2();
7442        assert!(encoder_thumb.thumb_mode);
7443    }
7444
7445    /// #204 WAKE-path regression: `SetCond` materialized 0/1 with the 16-bit
7446    /// `MOVS Rd,#imm` (T1), whose Rd field is 3 bits (R0–R7). For a high Rd
7447    /// (R8–R12) `rd_bits << 8` overflows bit 11, flipping the opcode MOVS→CMP
7448    /// (`0x2c00`), so the boolean was never written — gale's `has_waiter` kept a
7449    /// stale value and the binary-sem WAKE dispatch read garbage. High Rd must
7450    /// use the 32-bit `MOV.W` (T2). Verify the bytes, not the IR.
7451    /// #311: the SAME high-Rd MOVS→CMP transmutation as #204, but in the
7452    /// i64 comparison expansions (I64SetCond / I64SetCondZ) — missed by the
7453    /// #204 hardening. With rd=R8 the boolean died in the flags
7454    /// (`ite eq; cmpeq r0,#1; cmpne r0,#0`), so gale's packed-u64 select
7455    /// read a stale register on silicon. High Rd must take MOV.W / CMP.W.
7456    #[test]
7457    fn test_encode_i64setcond_high_reg_uses_mov_w_311() {
7458        use synth_synthesis::{ArmOp, Condition, Reg};
7459        let enc = ArmEncoder::new_thumb2();
7460        let bytes = enc
7461            .encode(&ArmOp::I64SetCond {
7462                rd: Reg::R8,
7463                rn_lo: Reg::R2,
7464                rn_hi: Reg::R3,
7465                rm_lo: Reg::R6,
7466                rm_hi: Reg::R7,
7467                cond: Condition::EQ,
7468            })
7469            .unwrap();
7470        // The 32-bit MOV.W immediate (T2) first halfword is 0xF04F; the
7471        // 16-bit transmuted forms would contain 0x2801/0x2800 (CMP r0,#1/#0).
7472        let halfwords: Vec<u16> = bytes
7473            .chunks(2)
7474            .map(|c| u16::from_le_bytes([c[0], c[1]]))
7475            .collect();
7476        assert!(
7477            halfwords.iter().filter(|&&h| h == 0xF04F).count() == 2,
7478            "high rd must use two MOV.W (T2) encodings, got {halfwords:04x?}"
7479        );
7480        assert!(
7481            !halfwords.contains(&0x2801) && !halfwords.contains(&0x2800),
7482            "no transmuted 16-bit CMP imm: {halfwords:04x?}"
7483        );
7484
7485        let bytes_z = enc
7486            .encode(&ArmOp::I64SetCondZ {
7487                rd: Reg::R8,
7488                rn_lo: Reg::R2,
7489                rn_hi: Reg::R3,
7490            })
7491            .unwrap();
7492        let hw_z: Vec<u16> = bytes_z
7493            .chunks(2)
7494            .map(|c| u16::from_le_bytes([c[0], c[1]]))
7495            .collect();
7496        assert!(
7497            hw_z.iter().filter(|&&h| h == 0xF04F).count() == 2,
7498            "SetCondZ high rd MOV.W: {hw_z:04x?}"
7499        );
7500        // CMP.W rd,#0 (T2) first halfword: 0xF1B0 | rd
7501        assert!(
7502            hw_z.contains(&(0xF1B0 | 8)),
7503            "SetCondZ high rd must use CMP.W: {hw_z:04x?}"
7504        );
7505    }
7506
7507    #[test]
7508    fn test_encode_setcond_high_reg_uses_mov_w_204() {
7509        use synth_synthesis::{ArmOp, Condition, Reg};
7510        let enc = ArmEncoder::new_thumb2();
7511        // R12 (high): must be ITE + MOV.W #1 + MOV.W #0, never a 16-bit MOVS/CMP.
7512        let hi = enc
7513            .encode(&ArmOp::SetCond {
7514                rd: Reg::R12,
7515                cond: Condition::NE,
7516            })
7517            .unwrap();
7518        assert_eq!(hi.len(), 10, "ITE(2) + MOV.W(4) + MOV.W(4): {hi:02x?}");
7519        // both value halfwords are MOV.W (0xF04F) — NOT the corrupt CMP (0x2c..).
7520        assert_eq!(&hi[2..4], &[0x4F, 0xF0], "then = MOV.W: {hi:02x?}");
7521        assert_eq!(&hi[6..8], &[0x4F, 0xF0], "else = MOV.W: {hi:02x?}");
7522        assert_eq!(hi[4] & 0x0F, 0x01, "then imm = #1");
7523        assert_eq!(hi[8] & 0x0F, 0x00, "else imm = #0");
7524        // Low Rd keeps the compact 16-bit MOVS form.
7525        let lo = enc
7526            .encode(&ArmOp::SetCond {
7527                rd: Reg::R0,
7528                cond: Condition::NE,
7529            })
7530            .unwrap();
7531        assert_eq!(lo.len(), 6, "ITE(2) + MOVS(2) + MOVS(2): {lo:02x?}");
7532        assert_eq!(lo[2..4], [0x01, 0x20], "then = MOVS R0,#1");
7533        assert_eq!(lo[4..6], [0x00, 0x20], "else = MOVS R0,#0");
7534    }
7535
7536    /// #209 Opt 1b: UMULL RdLo, RdHi, Rn, Rm encodes correctly on both ISAs.
7537    /// Thumb-2 T1: 1111 1011 1010 Rn | RdLo RdHi 0000 Rm.
7538    /// A32:        cond 0000 1000 RdHi RdLo Rm 1001 Rn.
7539    #[test]
7540    fn test_encode_umull_209b() {
7541        use synth_synthesis::{ArmOp, Reg};
7542        let op = ArmOp::Umull {
7543            rdlo: Reg::R4,
7544            rdhi: Reg::R5,
7545            rn: Reg::R0,
7546            rm: Reg::R3,
7547        };
7548        // Thumb-2: hw1 = 0xFBA0 | 0 = 0xFBA0; hw2 = (4<<12)|(5<<8)|3 = 0x4503.
7549        let t = ArmEncoder::new_thumb2().encode(&op).unwrap();
7550        assert_eq!(
7551            t,
7552            vec![0xA0, 0xFB, 0x03, 0x45],
7553            "umull r4,r5,r0,r3 (T2): {t:02x?}"
7554        );
7555        // A32: 0xE0800090 | (5<<16) | (4<<12) | (3<<8) | 0 = 0xE0854390.
7556        let a = ArmEncoder::new_arm32().encode(&op).unwrap();
7557        assert_eq!(
7558            a,
7559            0xE085_4390u32.to_le_bytes().to_vec(),
7560            "umull (A32): {a:02x?}"
7561        );
7562    }
7563
7564    /// #206 regression: the ARM32 (A32) `Ldr`/`Str` encoders fed `addr` through
7565    /// `encode_mem_addr`, which returns only the 12-bit immediate — so a register
7566    /// offset (`[rn, rm, #off]`) was silently dropped to `[rn, #off]`, sending
7567    /// the access to the wrong runtime address (silent miscompile on the default
7568    /// `--target arm`). A register offset must materialize `ip = rn + rm` and
7569    /// load from `[ip, #off]`. Verify the bytes.
7570    #[test]
7571    fn test_encode_arm32_indexed_load_keeps_index_206() {
7572        use synth_synthesis::{ArmOp, MemAddr, Reg};
7573        let enc = ArmEncoder::new_arm32();
7574        // ldr r0, [r11, r1, #8]  must NOT collapse to a single immediate ldr.
7575        let bytes = enc
7576            .encode(&ArmOp::Ldr {
7577                rd: Reg::R0,
7578                addr: MemAddr::reg_imm(Reg::R11, Reg::R1, 8),
7579            })
7580            .unwrap();
7581        assert_eq!(
7582            bytes.len(),
7583            8,
7584            "expected ADD ip + LDR (2 words): {bytes:02x?}"
7585        );
7586        let add = u32::from_le_bytes(bytes[0..4].try_into().unwrap());
7587        let ldr = u32::from_le_bytes(bytes[4..8].try_into().unwrap());
7588        // ADD ip, r11, r1  = 0xE08BC001
7589        assert_eq!(add, 0xE08B_C001, "ADD ip,r11,r1: {add:#010x}");
7590        // LDR r0, [ip, #8] = 0xE59C0008
7591        assert_eq!(ldr, 0xE59C_0008, "LDR r0,[ip,#8]: {ldr:#010x}");
7592        // A bare immediate ldr (the bug) would be 0xE59B0008 (base=r11) — reject.
7593        assert_ne!(ldr, 0xE59B_0008, "index must not be dropped");
7594    }
7595
7596    /// #178/#180 regression: the Thumb `Add`/`Adds`/`Subs` reg-forms used the
7597    /// 16-bit encoding unconditionally. For high registers (R12 base scratch,
7598    /// R8-R11 i64 pairs) the 3-bit register fields overflow and corrupt the
7599    /// operands — `add ip,ip,r0` came out as `adds r4,r5,r1` (0x186C), silently
7600    /// dropping the address operand and miscompiling every optimized memory
7601    /// access. High registers must use the 32-bit `.W` forms.
7602    #[test]
7603    fn test_encode_thumb_add_high_reg_uses_add_w_178_180() {
7604        let encoder = ArmEncoder::new_thumb2();
7605
7606        // add ip, ip, r0  — the exact MemLoad/MemStore base+addr op.
7607        let code = encoder
7608            .encode(&ArmOp::Add {
7609                rd: Reg::R12,
7610                rn: Reg::R12,
7611                op2: Operand2::Reg(Reg::R0),
7612            })
7613            .unwrap();
7614        // ADD.W ip, ip, r0 = EB0C 0C00 (little-endian halfwords).
7615        assert_eq!(
7616            code,
7617            vec![0x0C, 0xEB, 0x00, 0x0C],
7618            "high-reg Thumb ADD must be 32-bit ADD.W (EB0C 0C00), not corrupt 16-bit; got {code:02X?}"
7619        );
7620        // Must NOT be the buggy 16-bit 0x186C (`adds r4,r5,r1`).
7621        assert_ne!(code, vec![0x6C, 0x18], "regressed to corrupt 16-bit ADDS");
7622
7623        // Low-register add stays 16-bit (no regression for the common case).
7624        let lo = encoder
7625            .encode(&ArmOp::Add {
7626                rd: Reg::R1,
7627                rn: Reg::R2,
7628                op2: Operand2::Reg(Reg::R3),
7629            })
7630            .unwrap();
7631        assert_eq!(
7632            lo.len(),
7633            2,
7634            "low-reg ADD should remain 16-bit, got {lo:02X?}"
7635        );
7636    }
7637
7638    /// #178/#180 sibling: i64 low-word `Adds`/`Subs` can land in R8-R11 pairs;
7639    /// those must fall back to 32-bit ADDS.W/SUBS.W (flag-setting preserved).
7640    #[test]
7641    fn test_encode_thumb_adds_subs_high_reg_use_32bit_178_180() {
7642        let encoder = ArmEncoder::new_thumb2();
7643
7644        // adds r10, r10, r8  → ADDS.W = EB1A 0A08
7645        let adds = encoder
7646            .encode(&ArmOp::Adds {
7647                rd: Reg::R10,
7648                rn: Reg::R10,
7649                op2: Operand2::Reg(Reg::R8),
7650            })
7651            .unwrap();
7652        assert_eq!(
7653            adds,
7654            vec![0x1A, 0xEB, 0x08, 0x0A],
7655            "high-reg ADDS must be 32-bit ADDS.W (EB1A 0A08); got {adds:02X?}"
7656        );
7657
7658        // subs r10, r10, r8  → SUBS.W = EBBA 0A08
7659        let subs = encoder
7660            .encode(&ArmOp::Subs {
7661                rd: Reg::R10,
7662                rn: Reg::R10,
7663                op2: Operand2::Reg(Reg::R8),
7664            })
7665            .unwrap();
7666        assert_eq!(
7667            subs,
7668            vec![0xBA, 0xEB, 0x08, 0x0A],
7669            "high-reg SUBS must be 32-bit SUBS.W (EBBA 0A08); got {subs:02X?}"
7670        );
7671    }
7672
7673    /// #184 (sibling of #180): 16-bit CMN (T1) only encodes R0-R7. High registers
7674    /// must use 32-bit CMN.W, not the corrupt truncated 16-bit form.
7675    #[test]
7676    fn test_encode_thumb_cmn_high_reg_uses_cmn_w_184() {
7677        let encoder = ArmEncoder::new_thumb2();
7678
7679        // cmn r10, r8  → CMN.W = EB1A 0F08 (ADD.W S=1, Rd=PC discarded).
7680        let cmn = encoder
7681            .encode(&ArmOp::Cmn {
7682                rn: Reg::R10,
7683                op2: Operand2::Reg(Reg::R8),
7684            })
7685            .unwrap();
7686        assert_eq!(
7687            cmn,
7688            vec![0x1A, 0xEB, 0x08, 0x0F],
7689            "high-reg CMN must be 32-bit CMN.W (EB1A 0F08); got {cmn:02X?}"
7690        );
7691
7692        // Low registers stay 16-bit: cmn r1, r2 = 0x42D1.
7693        let lo = encoder
7694            .encode(&ArmOp::Cmn {
7695                rn: Reg::R1,
7696                op2: Operand2::Reg(Reg::R2),
7697            })
7698            .unwrap();
7699        assert_eq!(
7700            lo.len(),
7701            2,
7702            "low-reg CMN should remain 16-bit, got {lo:02X?}"
7703        );
7704        assert_eq!(lo, vec![0xD1, 0x42], "low-reg CMN bytes wrong: {lo:02X?}");
7705    }
7706
7707    /// #185 regression: feeding PC (R15) as a data operand to a Thumb-2 op that
7708    /// guards its registers must return Err, not panic under debug-assertions.
7709    /// (Synth never emits PC here; the fuzz harness requires encode() be total.)
7710    #[test]
7711    fn test_encode_pc_operand_returns_err_not_panic_185() {
7712        let encoder = ArmEncoder::new_thumb2();
7713        for op in [
7714            ArmOp::Sdiv {
7715                rd: Reg::PC,
7716                rn: Reg::R0,
7717                rm: Reg::R1,
7718            },
7719            ArmOp::Udiv {
7720                rd: Reg::R0,
7721                rn: Reg::PC,
7722                rm: Reg::R1,
7723            },
7724            ArmOp::Sdiv {
7725                rd: Reg::R0,
7726                rn: Reg::R1,
7727                rm: Reg::PC,
7728            },
7729        ] {
7730            let r = encoder.encode(&op);
7731            assert!(
7732                r.is_err(),
7733                "encode({op:?}) must return Err for a PC operand, got {r:?}"
7734            );
7735        }
7736        // Valid registers still encode fine (no false rejection).
7737        assert!(
7738            encoder
7739                .encode(&ArmOp::Sdiv {
7740                    rd: Reg::R0,
7741                    rn: Reg::R1,
7742                    rm: Reg::R2
7743                })
7744                .is_ok()
7745        );
7746    }
7747
7748    #[test]
7749    fn test_encode_nop_arm32() {
7750        let encoder = ArmEncoder::new_arm32();
7751        let code = encoder.encode(&ArmOp::Nop).unwrap();
7752
7753        assert_eq!(code.len(), 4); // ARM32 instructions are 4 bytes
7754        assert_eq!(code, vec![0x00, 0x00, 0xA0, 0xE1]); // MOV R0, R0
7755    }
7756
7757    #[test]
7758    fn test_encode_nop_thumb() {
7759        let encoder = ArmEncoder::new_thumb2();
7760        let code = encoder.encode(&ArmOp::Nop).unwrap();
7761
7762        assert_eq!(code.len(), 2); // Thumb instructions are 2 bytes
7763        assert_eq!(code, vec![0x00, 0xBF]); // NOP
7764    }
7765
7766    #[test]
7767    fn test_encode_mov_immediate_arm32() {
7768        let encoder = ArmEncoder::new_arm32();
7769        let op = ArmOp::Mov {
7770            rd: Reg::R0,
7771            op2: Operand2::Imm(42),
7772        };
7773
7774        let code = encoder.encode(&op).unwrap();
7775        assert_eq!(code.len(), 4);
7776
7777        // Verify it's a MOV instruction (bits should have immediate flag set)
7778        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7779        assert_eq!(instr & 0x0E000000, 0x02000000); // Check I bit is set
7780    }
7781
7782    #[test]
7783    fn test_encode_add_registers_arm32() {
7784        let encoder = ArmEncoder::new_arm32();
7785        let op = ArmOp::Add {
7786            rd: Reg::R0,
7787            rn: Reg::R1,
7788            op2: Operand2::Reg(Reg::R2),
7789        };
7790
7791        let code = encoder.encode(&op).unwrap();
7792        assert_eq!(code.len(), 4);
7793
7794        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7795        // Verify it's an ADD instruction with correct opcode
7796        assert_eq!(instr & 0x0FE00000, 0x00800000);
7797    }
7798
7799    /// #350 — `encode_thumb32_add_imm` must lower an out-of-range immediate
7800    /// (> 0xFFF) to a legal MOVW(/MOVT) + ADD.W-register sequence instead of
7801    /// erroring. The small-imm fast path (imm <= 0xFFF) stays byte-identical.
7802    #[test]
7803    fn test_encode_add_imm_large_350() {
7804        let enc = ArmEncoder::new_thumb2();
7805
7806        // --- Fast path unchanged: imm <= 0xFFF is a single 4-byte ADD.W ---
7807        let small = enc
7808            .encode_thumb32_add_imm(&Reg::R0, &Reg::R1, 0x123)
7809            .unwrap();
7810        assert_eq!(small.len(), 4, "small imm must stay a single instruction");
7811
7812        // helper: decode a Thumb-2 MOVW/MOVT halfword pair back to its imm16
7813        fn movx_imm16(b: &[u8]) -> u32 {
7814            let hw1 = u16::from_le_bytes([b[0], b[1]]) as u32;
7815            let hw2 = u16::from_le_bytes([b[2], b[3]]) as u32;
7816            let imm4 = hw1 & 0xF;
7817            let i = (hw1 >> 10) & 1;
7818            let imm3 = (hw2 >> 12) & 0x7;
7819            let imm8 = hw2 & 0xFF;
7820            (imm4 << 12) | (i << 11) | (imm3 << 8) | imm8
7821        }
7822        fn movx_rd(b: &[u8]) -> u32 {
7823            (u16::from_le_bytes([b[2], b[3]]) as u32 >> 8) & 0xF
7824        }
7825
7826        // --- rd != rn: scratch is rd. imm = 70000 = 0x11170 needs MOVW+MOVT. ---
7827        // 0x11170: lo16 = 0x1170, hi16 = 0x0001
7828        let seq = enc
7829            .encode_thumb32_add_imm(&Reg::R12, &Reg::R0, 70000)
7830            .unwrap();
7831        assert_eq!(seq.len(), 12, "MOVW + MOVT + ADD = 12 bytes");
7832        // MOVW r12, #0x1170
7833        assert_eq!(u16::from_le_bytes([seq[0], seq[1]]) & 0xFBF0, 0xF240);
7834        assert_eq!(movx_rd(&seq[0..4]), 12);
7835        assert_eq!(movx_imm16(&seq[0..4]), 0x1170);
7836        // MOVT r12, #0x0001
7837        assert_eq!(u16::from_le_bytes([seq[4], seq[5]]) & 0xFBF0, 0xF2C0);
7838        assert_eq!(movx_rd(&seq[4..8]), 12);
7839        assert_eq!(movx_imm16(&seq[4..8]), 0x0001);
7840        // ADD.W r12, r0, r12  (EB00 | rn=0 ; rd=12, rm=12)
7841        let add1 = u16::from_le_bytes([seq[8], seq[9]]) as u32;
7842        let add2 = u16::from_le_bytes([seq[10], seq[11]]) as u32;
7843        assert_eq!(add1 & 0xFFF0, 0xEB00);
7844        assert_eq!(add1 & 0xF, 0); // rn = r0
7845        assert_eq!((add2 >> 8) & 0xF, 12); // rd = r12
7846        assert_eq!(add2 & 0xF, 12); // rm = scratch = r12
7847        // The materialized scratch must reconstruct exactly 70000.
7848        assert_eq!(
7849            (movx_imm16(&seq[4..8]) << 16) | movx_imm16(&seq[0..4]),
7850            70000
7851        );
7852
7853        // --- imm <= 0xFFFF: MOVT is skipped (MOVW + ADD = 8 bytes). ---
7854        let seq16 = enc
7855            .encode_thumb32_add_imm(&Reg::R3, &Reg::R0, 0xABCD)
7856            .unwrap();
7857        assert_eq!(seq16.len(), 8, "imm <= 0xFFFF skips MOVT");
7858        assert_eq!(movx_imm16(&seq16[0..4]), 0xABCD);
7859        assert_eq!(movx_rd(&seq16[0..4]), 3); // scratch = rd = r3
7860
7861        // --- rd == rn (in-place add): scratch must be R12, not rd. ---
7862        // imm = 0x12345: lo16 = 0x2345, hi16 = 0x0001
7863        let inplace = enc
7864            .encode_thumb32_add_imm(&Reg::R5, &Reg::R5, 0x12345)
7865            .unwrap();
7866        assert_eq!(inplace.len(), 12);
7867        assert_eq!(movx_rd(&inplace[0..4]), 12, "rd==rn must use R12 scratch");
7868        assert_eq!(
7869            (movx_imm16(&inplace[4..8]) << 16) | movx_imm16(&inplace[0..4]),
7870            0x12345
7871        );
7872        // ADD.W r5, r5, r12 — rm must be the scratch (12), never rn.
7873        let ip_add2 = u16::from_le_bytes([inplace[10], inplace[11]]) as u32;
7874        assert_eq!(ip_add2 & 0xF, 12);
7875        assert_eq!((ip_add2 >> 8) & 0xF, 5);
7876    }
7877
7878    /// #350 follow-up — the `encoder_no_panic` fuzz harness drives the encoder
7879    /// with ARBITRARY registers, including the one case the in-place lowering
7880    /// cannot serve: rd==rn==R12. There the scratch (R12, the reserved encoder
7881    /// register) would alias Rn and clobber it before the ADD reads it. The
7882    /// encoder contract (#180/#185) is Ok-or-Err, never a panic — so this must
7883    /// return Err, not assert. (Real codegen never emits rd==rn==R12 because R12
7884    /// is non-allocatable; this guards only the fuzz/adversarial path.)
7885    #[test]
7886    fn test_encode_add_imm_large_rd_rn_r12_errs_not_panics_350() {
7887        let enc = ArmEncoder::new_thumb2();
7888        // Out-of-range imm with rd==rn==R12: no free scratch -> Err.
7889        let r = enc.encode_thumb32_add_imm(&Reg::R12, &Reg::R12, 70000);
7890        assert!(
7891            r.is_err(),
7892            "rd==rn==R12 with out-of-range imm must Err (no free scratch), got {r:?}"
7893        );
7894        // Small imm with rd==rn==R12 still takes the single-instruction fast path
7895        // (no scratch needed) and must succeed — the guard is scoped to the
7896        // out-of-range lowering only.
7897        let small = enc.encode_thumb32_add_imm(&Reg::R12, &Reg::R12, 0x10);
7898        assert!(small.is_ok(), "small imm needs no scratch, must stay Ok");
7899    }
7900
7901    /// #378 — `encode_operand2` (ARM32 data-processing operand) must FAIL
7902    /// HONESTLY on an immediate that is not a valid rotated immediate, rather
7903    /// than silently masking it to `imm & 0xFF` and emitting a WRONG
7904    /// instruction. `0x1FF` has 9 set bits, so it cannot come from rotating an
7905    /// 8-bit imm8 — non-encodable. Real codegen materializes large constants via
7906    /// MOVW/MOVT; this guards the encoder's Ok-or-Err contract (#180/#185)
7907    /// directly. It is an Err (not a panic) so the `encoder_no_panic` fuzz
7908    /// harness — which drives arbitrary operands — still passes.
7909    #[test]
7910    fn test_encode_operand2_non_rotatable_imm_errs_not_masks_378() {
7911        let enc = ArmEncoder::new_arm32();
7912        let bad = enc.encode(&ArmOp::Add {
7913            rd: Reg::R0,
7914            rn: Reg::R1,
7915            op2: Operand2::Imm(0x1FF),
7916        });
7917        assert!(
7918            bad.is_err(),
7919            "non-rotatable ARM32 immediate 0x1FF must Err (was silently masked \
7920             to 0xFF), got {bad:?}"
7921        );
7922        // A representable rotated immediate still encodes fine (regression guard).
7923        let ok = enc.encode(&ArmOp::Add {
7924            rd: Reg::R0,
7925            rn: Reg::R1,
7926            op2: Operand2::Imm(0xFF),
7927        });
7928        assert!(
7929            ok.is_ok(),
7930            "0xFF is a valid rotated immediate, must stay Ok"
7931        );
7932    }
7933
7934    #[test]
7935    fn test_encode_ldr_arm32() {
7936        let encoder = ArmEncoder::new_arm32();
7937        let op = ArmOp::Ldr {
7938            rd: Reg::R0,
7939            addr: MemAddr::imm(Reg::R1, 4),
7940        };
7941
7942        let code = encoder.encode(&op).unwrap();
7943        assert_eq!(code.len(), 4);
7944
7945        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7946        // Verify load bit is set
7947        assert_eq!(instr & 0x00100000, 0x00100000);
7948    }
7949
7950    #[test]
7951    fn test_encode_str_arm32() {
7952        let encoder = ArmEncoder::new_arm32();
7953        let op = ArmOp::Str {
7954            rd: Reg::R0,
7955            addr: MemAddr::imm(Reg::SP, 0),
7956        };
7957
7958        let code = encoder.encode(&op).unwrap();
7959        assert_eq!(code.len(), 4);
7960    }
7961
7962    #[test]
7963    fn test_encode_branch_arm32() {
7964        let encoder = ArmEncoder::new_arm32();
7965        let op = ArmOp::Bl {
7966            label: "main".to_string(),
7967        };
7968
7969        let code = encoder.encode(&op).unwrap();
7970        assert_eq!(code.len(), 4);
7971
7972        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
7973        // Verify BL opcode
7974        assert_eq!(instr & 0x0F000000, 0x0B000000);
7975    }
7976
7977    /// Regression test for #167 + #174: the Thumb-2 BL relocatable placeholder
7978    /// must carry a -4 addend so an R_ARM_THM_CALL nets to exactly the symbol S.
7979    /// The correct encoding is what `gas` emits for `bl <extern>`: f7ff fffe
7980    /// (hw1=0xF7FF, hw2=0xFFFE), little-endian bytes FF F7 FE FF.
7981    ///   - 0xD000 (J1=J2=0) → ~+0x600000 garbage addend: `bl c0000c` / truncated
7982    ///     to fit (#167).
7983    ///   - 0xF800 (addend 0) → lands at S+4, one instruction past the callee
7984    ///     entry (#174).
7985    ///   - 0xFFFE (addend -4) → lands at S. Correct.
7986    #[test]
7987    fn test_encode_thumb_bl_placeholder_addend_167_174() {
7988        let encoder = ArmEncoder::new_thumb2();
7989        let op = ArmOp::Bl {
7990            label: "callee".to_string(),
7991        };
7992
7993        let code = encoder.encode(&op).unwrap();
7994        assert_eq!(code.len(), 4, "Thumb-2 BL is 32-bit");
7995
7996        let hw1 = u16::from_le_bytes([code[0], code[1]]);
7997        let hw2 = u16::from_le_bytes([code[2], code[3]]);
7998        assert_eq!(hw1, 0xF7FF, "BL first halfword (matches gas `bl <extern>`)");
7999        assert_eq!(
8000            hw2, 0xFFFE,
8001            "BL second halfword must be 0xFFFE (-4 addend → nets to S), not 0xF800 (→ S+4, #174) or 0xD000 (#167)"
8002        );
8003        assert_ne!(hw2, 0xF800, "0xF800 (addend 0) lands at S+4 (#174)");
8004        assert_ne!(hw2, 0xD000, "0xD000 bakes in a ~+0x600000 addend (#167)");
8005    }
8006
/// Encoding three ARM32 instructions back to back yields 12 bytes.
#[test]
fn test_encode_sequence() {
    // MOV R0, #42 ; MOV R1, #10 ; ADD R2, R0, R1
    let load_first = ArmOp::Mov {
        rd: Reg::R0,
        op2: Operand2::Imm(42),
    };
    let load_second = ArmOp::Mov {
        rd: Reg::R1,
        op2: Operand2::Imm(10),
    };
    let sum = ArmOp::Add {
        rd: Reg::R2,
        rn: Reg::R0,
        op2: Operand2::Reg(Reg::R1),
    };
    let program: Vec<ArmOp> = vec![load_first, load_second, sum];

    let bytes = ArmEncoder::new_arm32()
        .encode_sequence(&program)
        .unwrap();

    // 3 instructions * 4 bytes
    assert_eq!(bytes.len(), 12);
}
8029
/// `reg_to_bits` maps each register to its numeric encoding.
#[test]
fn test_reg_to_bits() {
    // (register, expected encoding) pairs, checked in order.
    let expectations = [
        (Reg::R0, 0),
        (Reg::R7, 7),
        (Reg::SP, 13),
        (Reg::LR, 14),
        (Reg::PC, 15),
    ];

    for (reg, bits) in expectations.iter() {
        assert_eq!(reg_to_bits(reg), *bits);
    }
}
8038
/// AND, ORR and EOR with register operands each encode to 4 bytes in ARM32 mode.
#[test]
fn test_encode_bitwise_operations() {
    let enc = ArmEncoder::new_arm32();

    // All three use rd = R0, rn = R1, op2 = R2; checked in AND, ORR, EOR order.
    let bitwise_ops = [
        ArmOp::And {
            rd: Reg::R0,
            rn: Reg::R1,
            op2: Operand2::Reg(Reg::R2),
        },
        ArmOp::Orr {
            rd: Reg::R0,
            rn: Reg::R1,
            op2: Operand2::Reg(Reg::R2),
        },
        ArmOp::Eor {
            rd: Reg::R0,
            rn: Reg::R1,
            op2: Operand2::Reg(Reg::R2),
        },
    ];

    for op in bitwise_ops.iter() {
        let bytes = enc.encode(op).unwrap();
        assert_eq!(bytes.len(), 4);
    }
}
8067
8068    // === Thumb-2 32-bit encoding tests ===
8069
/// `SDIV R0, R1, R2` in Thumb-2 mode must produce the bytes 91 FB F2 F0.
#[test]
fn test_encode_sdiv_thumb2() {
    let bytes = ArmEncoder::new_thumb2()
        .encode(&ArmOp::Sdiv {
            rd: Reg::R0,
            rn: Reg::R1,
            rm: Reg::R2,
        })
        .unwrap();

    // 32-bit Thumb-2 instruction
    assert_eq!(bytes.len(), 4);

    // SDIV R0, R1, R2: 0xFB91 0xF0F2
    //   first halfword:  0xFB90 | Rn(1)              = 0xFB91
    //   second halfword: 0xF0F0 | Rd(0) << 8 | Rm(2) = 0xF0F2
    // Each halfword is stored little-endian.
    let expected = [0x91, 0xFB, 0xF2, 0xF0];
    for (idx, want) in expected.iter().enumerate() {
        assert_eq!(bytes[idx], *want);
    }
}
8091
/// `UDIV R0, R1, R2` in Thumb-2 mode must produce the bytes B1 FB F2 F0.
#[test]
fn test_encode_udiv_thumb2() {
    let bytes = ArmEncoder::new_thumb2()
        .encode(&ArmOp::Udiv {
            rd: Reg::R0,
            rn: Reg::R1,
            rm: Reg::R2,
        })
        .unwrap();

    // 32-bit Thumb-2 instruction
    assert_eq!(bytes.len(), 4);

    // UDIV R0, R1, R2: 0xFBB1 0xF0F2, each halfword stored little-endian.
    let expected = [0xB1, 0xFB, 0xF2, 0xF0];
    for (idx, want) in expected.iter().enumerate() {
        assert_eq!(bytes[idx], *want);
    }
}
8111
/// `MUL R0, R1, R2` must encode to four bytes in Thumb-2 mode.
#[test]
fn test_encode_mul_thumb2() {
    let bytes = ArmEncoder::new_thumb2()
        .encode(&ArmOp::Mul {
            rd: Reg::R0,
            rn: Reg::R1,
            rm: Reg::R2,
        })
        .unwrap();

    // 32-bit Thumb-2 instruction
    assert_eq!(bytes.len(), 4);
}
8124
/// `AND R0, R1, R2` must encode to four bytes in Thumb-2 mode.
#[test]
fn test_encode_and_thumb2() {
    let and_op = ArmOp::And {
        rd: Reg::R0,
        rn: Reg::R1,
        op2: Operand2::Reg(Reg::R2),
    };
    let enc = ArmEncoder::new_thumb2();

    let encoded_len = enc.encode(&and_op).unwrap().len();

    // 32-bit Thumb-2 instruction
    assert_eq!(encoded_len, 4);
}
8137
/// `LSL R0, R1, #5` on low registers must encode to two bytes in Thumb-2 mode.
#[test]
fn test_encode_lsl_thumb2_low_regs() {
    let bytes = ArmEncoder::new_thumb2()
        .encode(&ArmOp::Lsl {
            rd: Reg::R0,
            rn: Reg::R1,
            shift: 5,
        })
        .unwrap();

    // 16-bit for low registers
    assert_eq!(bytes.len(), 2);
}
8150
/// `CLZ R0, R1` must encode to four bytes in Thumb-2 mode.
#[test]
fn test_encode_clz_thumb2() {
    let clz = ArmOp::Clz {
        rd: Reg::R0,
        rm: Reg::R1,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&clz).unwrap();

    // 32-bit Thumb-2 instruction
    assert_eq!(bytes.len(), 4);
}
8162
/// `BX LR` in Thumb-2 mode is the 16-bit instruction 0x4770.
#[test]
fn test_encode_bx_thumb2() {
    let bytes = ArmEncoder::new_thumb2()
        .encode(&ArmOp::Bx { rm: Reg::LR })
        .unwrap();

    // 16-bit instruction
    assert_eq!(bytes.len(), 2);

    // BX LR: 0x4770, stored little-endian.
    let expected = vec![0x70, 0x47];
    assert_eq!(bytes, expected);
}
8174
8175    // ========================================================================
8176    // f32 pseudo-op encoding tests
8177    // ========================================================================
8178
/// `F32Abs S0, S2` must encode to four bytes in ARM32 mode.
#[test]
fn test_encode_f32_abs_arm32() {
    let bytes = ArmEncoder::new_arm32()
        .encode(&ArmOp::F32Abs {
            sd: VfpReg::S0,
            sm: VfpReg::S2,
        })
        .unwrap();

    // Single VFP instruction
    assert_eq!(bytes.len(), 4);
}
8189
/// `F32Neg S0, S2` must encode to four bytes in ARM32 mode.
#[test]
fn test_encode_f32_neg_arm32() {
    let negate = ArmOp::F32Neg {
        sd: VfpReg::S0,
        sm: VfpReg::S2,
    };

    let bytes = ArmEncoder::new_arm32().encode(&negate).unwrap();

    assert_eq!(bytes.len(), 4);
}
8200
/// `F32Sqrt S0, S2` must encode to four bytes in ARM32 mode.
#[test]
fn test_encode_f32_sqrt_arm32() {
    let enc = ArmEncoder::new_arm32();
    let root = ArmOp::F32Sqrt {
        sd: VfpReg::S0,
        sm: VfpReg::S2,
    };

    let encoded_len = enc.encode(&root).unwrap().len();

    assert_eq!(encoded_len, 4);
}
8211
/// The `F32Ceil` pseudo-op must expand to 36 bytes in ARM32 mode.
#[test]
fn test_encode_f32_ceil_arm32() {
    let bytes = ArmEncoder::new_arm32()
        .encode(&ArmOp::F32Ceil {
            sd: VfpReg::S0,
            sm: VfpReg::S2,
        })
        .unwrap();

    // Expected expansion, nine 4-byte instructions:
    // VMRS + BIC + ORR + VMSR + VCVT.S32.F32 + VMRS + BIC + VMSR + VCVT.F32.S32
    assert_eq!(bytes.len(), 36);
}
8223
/// The `F32Floor` pseudo-op must expand to 36 bytes in Thumb-2 mode.
#[test]
fn test_encode_f32_floor_thumb2() {
    let floor = ArmOp::F32Floor {
        sd: VfpReg::S0,
        sm: VfpReg::S2,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&floor).unwrap();

    // Expected expansion:
    // VMRS + BIC.W + ORR.W + VMSR + VCVT + VMRS + BIC.W + VMSR + VCVT.F32.S32
    assert_eq!(bytes.len(), 36);
}
8235
/// The `F32Min` pseudo-op must expand to 16 bytes in ARM32 mode.
#[test]
fn test_encode_f32_min_arm32() {
    let bytes = ArmEncoder::new_arm32()
        .encode(&ArmOp::F32Min {
            sd: VfpReg::S0,
            sn: VfpReg::S2,
            sm: VfpReg::S4,
        })
        .unwrap();

    // VMOV + VCMP + VMRS + conditional VMOV
    assert_eq!(bytes.len(), 16);
}
8247
/// The `F32Max` pseudo-op must expand to 18 bytes in Thumb-2 mode.
#[test]
fn test_encode_f32_max_thumb2() {
    let maximum = ArmOp::F32Max {
        sd: VfpReg::S0,
        sn: VfpReg::S2,
        sm: VfpReg::S4,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&maximum).unwrap();

    // VMOV(4) + VCMP(4) + VMRS(4) + IT(2) + VMOV(4) = 18
    assert_eq!(bytes.len(), 18);
}
8260
/// The `F32Copysign` pseudo-op must expand to 24 bytes in ARM32 mode.
#[test]
fn test_encode_f32_copysign_arm32() {
    let bytes = ArmEncoder::new_arm32()
        .encode(&ArmOp::F32Copysign {
            sd: VfpReg::S0,
            sn: VfpReg::S2,
            sm: VfpReg::S4,
        })
        .unwrap();

    // VMOV + VMOV + AND + BIC + ORR + VMOV = 6 * 4 = 24
    assert_eq!(bytes.len(), 24);
}
8273
8274    // ========================================================================
8275    // f64 encoding tests
8276    // ========================================================================
8277
/// `F64Add D0, D1, D2` is one 4-byte ARM32 instruction using coprocessor 11.
#[test]
fn test_encode_f64_add_arm32() {
    let bytes = ArmEncoder::new_arm32()
        .encode(&ArmOp::F64Add {
            dd: VfpReg::D0,
            dn: VfpReg::D1,
            dm: VfpReg::D2,
        })
        .unwrap();
    assert_eq!(bytes.len(), 4);

    // VADD.F64 D0, D1, D2: bits [11:8] hold the coprocessor number, cp11 (0xB).
    let word = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
    let coprocessor = (word >> 8) & 0xF;
    assert_eq!(coprocessor, 0xB); // cp11
}
8292
/// `F64Sub D0, D1, D2` must encode to four bytes in Thumb-2 mode.
#[test]
fn test_encode_f64_sub_thumb2() {
    let subtract = ArmOp::F64Sub {
        dd: VfpReg::D0,
        dn: VfpReg::D1,
        dm: VfpReg::D2,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&subtract).unwrap();

    // 32-bit VFP as two Thumb halfwords
    assert_eq!(bytes.len(), 4);
}
8304
/// `F64Mul D0, D1, D2` must encode to four bytes in ARM32 mode.
#[test]
fn test_encode_f64_mul_arm32() {
    let bytes = ArmEncoder::new_arm32()
        .encode(&ArmOp::F64Mul {
            dd: VfpReg::D0,
            dn: VfpReg::D1,
            dm: VfpReg::D2,
        })
        .unwrap();

    assert_eq!(bytes.len(), 4);
}
8316
/// `F64Div D0, D1, D2` must encode to four bytes in ARM32 mode.
#[test]
fn test_encode_f64_div_arm32() {
    let enc = ArmEncoder::new_arm32();
    let divide = ArmOp::F64Div {
        dd: VfpReg::D0,
        dn: VfpReg::D1,
        dm: VfpReg::D2,
    };

    let encoded_len = enc.encode(&divide).unwrap().len();

    assert_eq!(encoded_len, 4);
}
8328
/// `F64Abs D0, D2` must encode to four bytes in ARM32 mode.
#[test]
fn test_encode_f64_abs_arm32() {
    let bytes = ArmEncoder::new_arm32()
        .encode(&ArmOp::F64Abs {
            dd: VfpReg::D0,
            dm: VfpReg::D2,
        })
        .unwrap();

    assert_eq!(bytes.len(), 4);
}
8339
/// `F64Neg D0, D2` must encode to four bytes in ARM32 mode.
#[test]
fn test_encode_f64_neg_arm32() {
    let negate = ArmOp::F64Neg {
        dd: VfpReg::D0,
        dm: VfpReg::D2,
    };

    let bytes = ArmEncoder::new_arm32().encode(&negate).unwrap();

    assert_eq!(bytes.len(), 4);
}
8350
/// `F64Sqrt D0, D2` must encode to four bytes in ARM32 mode.
#[test]
fn test_encode_f64_sqrt_arm32() {
    let enc = ArmEncoder::new_arm32();
    let root = ArmOp::F64Sqrt {
        dd: VfpReg::D0,
        dm: VfpReg::D2,
    };

    let encoded_len = enc.encode(&root).unwrap().len();

    assert_eq!(encoded_len, 4);
}
8361
/// `F64Load D0, [R0, #8]` is a 4-byte cp11 instruction whose low byte holds offset / 4.
#[test]
fn test_encode_f64_load_arm32() {
    let bytes = ArmEncoder::new_arm32()
        .encode(&ArmOp::F64Load {
            dd: VfpReg::D0,
            addr: MemAddr::imm(Reg::R0, 8),
        })
        .unwrap();
    assert_eq!(bytes.len(), 4);

    let word = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);

    let coprocessor = (word >> 8) & 0xF;
    assert_eq!(coprocessor, 0xB); // cp11 for F64

    let scaled_offset = word & 0xFF;
    assert_eq!(scaled_offset, 2); // offset 8 / 4 = 2
}
8375
/// `F64Store D0, [SP, #0]` must encode to four bytes in Thumb-2 mode.
#[test]
fn test_encode_f64_store_thumb2() {
    let store = ArmOp::F64Store {
        dd: VfpReg::D0,
        addr: MemAddr::imm(Reg::SP, 0),
    };

    let bytes = ArmEncoder::new_thumb2().encode(&store).unwrap();

    assert_eq!(bytes.len(), 4);
}
8386
/// The `F64Eq` comparison must expand to 16 bytes in ARM32 mode.
#[test]
fn test_encode_f64_compare_arm32() {
    let bytes = ArmEncoder::new_arm32()
        .encode(&ArmOp::F64Eq {
            rd: Reg::R0,
            dn: VfpReg::D0,
            dm: VfpReg::D1,
        })
        .unwrap();

    // VCMP + VMRS + MOV #0 + MOVcond #1
    assert_eq!(bytes.len(), 16);
}
8398
/// The `F64Lt` comparison must expand to 14 bytes in Thumb-2 mode.
#[test]
fn test_encode_f64_compare_thumb2() {
    let less_than = ArmOp::F64Lt {
        rd: Reg::R0,
        dn: VfpReg::D0,
        dm: VfpReg::D1,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&less_than).unwrap();

    // VCMP(4) + VMRS(4) + MOVS(2) + IT(2) + MOV(2) = 14
    assert_eq!(bytes.len(), 14);
}
8411
/// Materializing the f64 constant 3.125 into D0 takes 20 bytes in ARM32 mode.
#[test]
fn test_encode_f64_const_arm32() {
    let bytes = ArmEncoder::new_arm32()
        .encode(&ArmOp::F64Const {
            dd: VfpReg::D0,
            value: 3.125,
        })
        .unwrap();

    // MOVW(4) + MOVT(4) + MOVW(4) + MOVT(4) + VMOV(4) = 20
    assert_eq!(bytes.len(), 20);
}
8423
/// Materializing the f64 constant 2.5 into D0 takes 20 bytes in Thumb-2 mode.
#[test]
fn test_encode_f64_const_thumb2() {
    let constant = ArmOp::F64Const {
        dd: VfpReg::D0,
        value: 2.5,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&constant).unwrap();

    // MOVW(4) + MOVT(4) + MOVW(4) + MOVT(4) + VMOV(4) = 20
    assert_eq!(bytes.len(), 20);
}
8435
/// `F64ConvertI32S D0, R0` must expand to 8 bytes in ARM32 mode.
#[test]
fn test_encode_f64_convert_i32s_arm32() {
    let bytes = ArmEncoder::new_arm32()
        .encode(&ArmOp::F64ConvertI32S {
            dd: VfpReg::D0,
            rm: Reg::R0,
        })
        .unwrap();

    // VMOV(4) + VCVT(4) = 8
    assert_eq!(bytes.len(), 8);
}
8447
/// `F64PromoteF32 D0, S0` must encode to four bytes in ARM32 mode.
#[test]
fn test_encode_f64_promote_f32_arm32() {
    let promote = ArmOp::F64PromoteF32 {
        dd: VfpReg::D0,
        sm: VfpReg::S0,
    };

    let bytes = ArmEncoder::new_arm32().encode(&promote).unwrap();

    // Single VCVT.F64.F32 instruction
    assert_eq!(bytes.len(), 4);
}
8458
/// `F64PromoteF32 D0, S0` must encode to four bytes in Thumb-2 mode.
#[test]
fn test_encode_f64_promote_f32_thumb2() {
    let bytes = ArmEncoder::new_thumb2()
        .encode(&ArmOp::F64PromoteF32 {
            dd: VfpReg::D0,
            sm: VfpReg::S0,
        })
        .unwrap();

    assert_eq!(bytes.len(), 4);
}
8469
/// `I32TruncF64S R0, D0` must expand to 8 bytes in ARM32 mode.
#[test]
fn test_encode_i32_trunc_f64s_arm32() {
    let truncate = ArmOp::I32TruncF64S {
        rd: Reg::R0,
        dm: VfpReg::D0,
    };

    let bytes = ArmEncoder::new_arm32().encode(&truncate).unwrap();

    // VCVT(4) + VMOV(4) = 8
    assert_eq!(bytes.len(), 8);
}
8481
/// Reinterpreting the register pair R0/R1 as D0 must encode to four bytes in ARM32 mode.
#[test]
fn test_encode_f64_reinterpret_i64_arm32() {
    let bytes = ArmEncoder::new_arm32()
        .encode(&ArmOp::F64ReinterpretI64 {
            dd: VfpReg::D0,
            rmlo: Reg::R0,
            rmhi: Reg::R1,
        })
        .unwrap();

    // Single VMOV instruction
    assert_eq!(bytes.len(), 4);
}
8493
/// Reinterpreting D0 as the register pair R0/R1 must encode to four bytes in Thumb-2 mode.
#[test]
fn test_encode_i64_reinterpret_f64_thumb2() {
    let reinterpret = ArmOp::I64ReinterpretF64 {
        rdlo: Reg::R0,
        rdhi: Reg::R1,
        dm: VfpReg::D0,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&reinterpret).unwrap();

    assert_eq!(bytes.len(), 4);
}
8505
/// `F64Trunc D0, D1` must expand to 8 bytes in Thumb-2 mode.
#[test]
fn test_encode_f64_trunc_thumb2() {
    let bytes = ArmEncoder::new_thumb2()
        .encode(&ArmOp::F64Trunc {
            dd: VfpReg::D0,
            dm: VfpReg::D1,
        })
        .unwrap();

    // Two VFP instructions via Thumb encoding
    assert_eq!(bytes.len(), 8);
}
8517
/// The `F64Min` pseudo-op must expand to 16 bytes in ARM32 mode.
#[test]
fn test_encode_f64_min_arm32() {
    let minimum = ArmOp::F64Min {
        dd: VfpReg::D0,
        dn: VfpReg::D1,
        dm: VfpReg::D2,
    };

    let bytes = ArmEncoder::new_arm32().encode(&minimum).unwrap();

    // VMOV + VCMP + VMRS + conditional VMOV = 16
    assert_eq!(bytes.len(), 16);
}
8530
/// F64 instructions must select coprocessor 11 (0xB), F32 instructions coprocessor 10 (0xA).
#[test]
fn test_f64_cp11_encoding() {
    let enc = ArmEncoder::new_arm32();

    // Encodes `op` and extracts bits [11:8] of the first instruction word.
    let coprocessor_of = |op: &ArmOp| -> u32 {
        let bytes = enc.encode(op).unwrap();
        let word = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
        (word >> 8) & 0xF
    };

    // F64Add
    let double_add = ArmOp::F64Add {
        dd: VfpReg::D0,
        dn: VfpReg::D0,
        dm: VfpReg::D0,
    };
    assert_eq!(coprocessor_of(&double_add), 0xB, "F64 should use cp11");

    // F32Add for comparison
    let single_add = ArmOp::F32Add {
        sd: VfpReg::S0,
        sn: VfpReg::S0,
        sm: VfpReg::S0,
    };
    assert_eq!(coprocessor_of(&single_add), 0xA, "F32 should use cp10");
}
8558
/// `F64Add` on D15/D14/D13 still yields one 4-byte cp11 instruction.
#[test]
fn test_dreg_encoding_higher_registers() {
    // D15 is the highest register exercised here.
    let high_regs_add = ArmOp::F64Add {
        dd: VfpReg::D15,
        dn: VfpReg::D14,
        dm: VfpReg::D13,
    };

    let bytes = ArmEncoder::new_arm32().encode(&high_regs_add).unwrap();
    assert_eq!(bytes.len(), 4);

    // Only the coprocessor field is checked; the register fields are not inspected.
    let word = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
    let coprocessor = (word >> 8) & 0xF;
    assert_eq!(coprocessor, 0xB); // cp11
}
8576
8577    // ========================================================================
8578    // Control flow encoding tests
8579    // ========================================================================
8580
/// A `Label` op must encode to zero bytes in both Thumb-2 and ARM32 modes.
#[test]
fn test_encode_label_emits_no_bytes() {
    let label = ArmOp::Label {
        name: ".Lblock_end_0".to_string(),
    };

    // Thumb-2 first, then ARM32, using the same op.
    let thumb_bytes = ArmEncoder::new_thumb2().encode(&label).unwrap();
    assert!(thumb_bytes.is_empty(), "Label should emit zero bytes");

    let arm_bytes = ArmEncoder::new_arm32().encode(&label).unwrap();
    assert!(
        arm_bytes.is_empty(),
        "Label should emit zero bytes in ARM32 too"
    );
}
8597
/// `BEQ` to an unresolved label in Thumb-2 mode is the 16-bit placeholder 0xD000.
#[test]
fn test_encode_bcc_eq_thumb2() {
    let branch_if_equal = ArmOp::Bcc {
        cond: synth_synthesis::Condition::EQ,
        label: "target".to_string(),
    };

    let bytes = ArmEncoder::new_thumb2().encode(&branch_if_equal).unwrap();

    // 16-bit conditional branch
    assert_eq!(bytes.len(), 2);

    // BEQ with offset 0: 0xD000, stored little-endian.
    let expected = vec![0x00, 0xD0];
    assert_eq!(bytes, expected);
}
8612
/// `BNE` to an unresolved label in Thumb-2 mode is the 16-bit placeholder 0xD100.
#[test]
fn test_encode_bcc_ne_thumb2() {
    let branch_if_not_equal = ArmOp::Bcc {
        cond: synth_synthesis::Condition::NE,
        label: "target".to_string(),
    };

    let bytes = ArmEncoder::new_thumb2()
        .encode(&branch_if_not_equal)
        .unwrap();
    assert_eq!(bytes.len(), 2);

    // BNE with offset 0: 0xD100, stored little-endian.
    let expected = vec![0x00, 0xD1];
    assert_eq!(bytes, expected);
}
8627
/// `BEQ` in ARM32 mode is one 32-bit word with the EQ condition and the branch opcode.
#[test]
fn test_encode_bcc_arm32() {
    let branch_if_equal = ArmOp::Bcc {
        cond: synth_synthesis::Condition::EQ,
        label: "target".to_string(),
    };

    let bytes = ArmEncoder::new_arm32().encode(&branch_if_equal).unwrap();

    // 32-bit ARM instruction
    assert_eq!(bytes.len(), 4);

    // BEQ: cond=0x0, opcode=0xA, offset=0 (the offset bits are not checked here).
    let word = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
    let condition_field = word & 0xF0000000;
    let opcode_field = word & 0x0F000000;
    assert_eq!(condition_field, 0x00000000); // EQ condition
    assert_eq!(opcode_field, 0x0A000000); // Branch opcode
}
8644
/// `UDF #0` in Thumb-2 mode is the 16-bit instruction 0xDE00.
#[test]
fn test_encode_udf_thumb2() {
    let bytes = ArmEncoder::new_thumb2()
        .encode(&ArmOp::Udf { imm: 0 })
        .unwrap();

    // 16-bit
    assert_eq!(bytes.len(), 2);

    // UDF #0: 0xDE00, stored little-endian.
    let expected = vec![0x00, 0xDE];
    assert_eq!(bytes, expected);
}
8655
/// `NOP` in Thumb-2 mode is the 16-bit instruction 0xBF00.
#[test]
fn test_encode_nop_thumb2() {
    let bytes = ArmEncoder::new_thumb2().encode(&ArmOp::Nop).unwrap();

    // 16-bit
    assert_eq!(bytes.len(), 2);

    // NOP: 0xBF00, stored little-endian.
    let expected = vec![0x00, 0xBF];
    assert_eq!(bytes, expected);
}
8666
8667    // =========================================================================
8668    // i64 Thumb-2 encoding tests
8669    // =========================================================================
8670
/// `I64Add` on (R0,R1) += (R2,R3) must expand to 6 bytes in Thumb-2 mode.
#[test]
fn test_encode_i64_add_thumb2() {
    let bytes = ArmEncoder::new_thumb2()
        .encode(&ArmOp::I64Add {
            rdlo: Reg::R0,
            rdhi: Reg::R1,
            rnlo: Reg::R0,
            rnhi: Reg::R1,
            rmlo: Reg::R2,
            rmhi: Reg::R3,
        })
        .unwrap();

    // Should emit ADDS (2 bytes) + ADC.W (4 bytes) = 6 bytes
    assert_eq!(bytes.len(), 6, "I64Add should be 6 bytes (ADDS + ADC.W)");
}
8686
/// `I64Sub` on (R0,R1) -= (R2,R3) must expand to 6 bytes in Thumb-2 mode.
#[test]
fn test_encode_i64_sub_thumb2() {
    let subtract = ArmOp::I64Sub {
        rdlo: Reg::R0,
        rdhi: Reg::R1,
        rnlo: Reg::R0,
        rnhi: Reg::R1,
        rmlo: Reg::R2,
        rmhi: Reg::R3,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&subtract).unwrap();

    // Should emit SUBS (2 bytes) + SBC.W (4 bytes) = 6 bytes
    assert_eq!(bytes.len(), 6, "I64Sub should be 6 bytes (SUBS + SBC.W)");
}
8702
/// `I64And` on (R0,R1) &= (R2,R3) must emit at least 4 bytes in Thumb-2 mode.
#[test]
fn test_encode_i64_and_thumb2() {
    let bytes = ArmEncoder::new_thumb2()
        .encode(&ArmOp::I64And {
            rdlo: Reg::R0,
            rdhi: Reg::R1,
            rnlo: Reg::R0,
            rnhi: Reg::R1,
            rmlo: Reg::R2,
            rmhi: Reg::R3,
        })
        .unwrap();

    // AND.W (4 bytes) + AND.W (4 bytes) = 8 bytes; only a lower bound is asserted.
    assert!(bytes.len() >= 4, "I64And should emit at least 4 bytes");
}
8718
/// `I64Or` on (R0,R1) |= (R2,R3) must emit at least 4 bytes in Thumb-2 mode.
#[test]
fn test_encode_i64_or_thumb2() {
    let or_pair = ArmOp::I64Or {
        rdlo: Reg::R0,
        rdhi: Reg::R1,
        rnlo: Reg::R0,
        rnhi: Reg::R1,
        rmlo: Reg::R2,
        rmhi: Reg::R3,
    };

    let encoded_len = ArmEncoder::new_thumb2().encode(&or_pair).unwrap().len();

    assert!(encoded_len >= 4, "I64Or should emit at least 4 bytes");
}
8733
/// `I64Xor` on (R0,R1) ^= (R2,R3) must emit at least 4 bytes in Thumb-2 mode.
#[test]
fn test_encode_i64_xor_thumb2() {
    let bytes = ArmEncoder::new_thumb2()
        .encode(&ArmOp::I64Xor {
            rdlo: Reg::R0,
            rdhi: Reg::R1,
            rnlo: Reg::R0,
            rnhi: Reg::R1,
            rmlo: Reg::R2,
            rmhi: Reg::R3,
        })
        .unwrap();

    assert!(bytes.len() >= 4, "I64Xor should emit at least 4 bytes");
}
8748
/// Loading the small i64 constant 42 into R0/R1 must emit at least 8 bytes in Thumb-2 mode.
#[test]
fn test_encode_i64_const_small_thumb2() {
    // Small constant: only needs MOVW for each half
    let small_const = ArmOp::I64Const {
        rdlo: Reg::R0,
        rdhi: Reg::R1,
        value: 42,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&small_const).unwrap();

    // MOVW R0, #42 (4 bytes) + MOVW R1, #0 (4 bytes) = 8 bytes minimum
    assert!(bytes.len() >= 8, "I64Const should emit at least 8 bytes");
}
8762
/// Loading a full-width i64 constant into R0/R1 must emit exactly 16 bytes in Thumb-2 mode.
#[test]
fn test_encode_i64_const_large_thumb2() {
    // Large constant: needs MOVW+MOVT for each half
    let bytes = ArmEncoder::new_thumb2()
        .encode(&ArmOp::I64Const {
            rdlo: Reg::R0,
            rdhi: Reg::R1,
            value: 0x1234_5678_9ABC_DEF0_u64 as i64,
        })
        .unwrap();

    // MOVW + MOVT for lo (8 bytes) + MOVW + MOVT for hi (8 bytes) = 16 bytes
    let encoded_len = bytes.len();
    assert_eq!(
        encoded_len, 16,
        "I64Const with large value should be 16 bytes"
    );
}
8780
/// Sign-extending R0 into R0/R1 (source equals low destination) must emit 4 bytes.
#[test]
fn test_encode_i64_extend_i32_s_thumb2() {
    let sign_extend = ArmOp::I64ExtendI32S {
        rdlo: Reg::R0,
        rdhi: Reg::R1,
        rn: Reg::R0,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&sign_extend).unwrap();

    // When rdlo == rn, only ASR (4 bytes) is emitted
    let encoded_len = bytes.len();
    assert_eq!(
        encoded_len, 4,
        "I64ExtendI32S (same reg) should be 4 bytes (ASR only)"
    );
}
8797
/// Sign-extending R2 into R0/R1 (source differs from low destination) must emit at least 6 bytes.
#[test]
fn test_encode_i64_extend_i32_s_diff_reg_thumb2() {
    let bytes = ArmEncoder::new_thumb2()
        .encode(&ArmOp::I64ExtendI32S {
            rdlo: Reg::R0,
            rdhi: Reg::R1,
            rn: Reg::R2,
        })
        .unwrap();

    // MOV rdlo, rn (2 bytes for low regs) + ASR rdhi, rdlo, #31 (4 bytes) = 6 bytes
    let encoded_len = bytes.len();
    assert!(
        encoded_len >= 6,
        "I64ExtendI32S (diff reg) should be at least 6 bytes"
    );
}
8813
/// Zero-extending R0 into R0/R1 (source equals low destination) must emit 2 bytes.
#[test]
fn test_encode_i64_extend_i32_u_thumb2() {
    let zero_extend = ArmOp::I64ExtendI32U {
        rdlo: Reg::R0,
        rdhi: Reg::R1,
        rn: Reg::R0,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&zero_extend).unwrap();

    // When rdlo == rn, only MOV rdhi, #0 (2 bytes) is emitted
    let encoded_len = bytes.len();
    assert_eq!(
        encoded_len, 2,
        "I64ExtendI32U (same reg) should be 2 bytes (MOV #0 only)"
    );
}
8830
/// Wrapping an i64 whose low half is already in `rd` must emit a single Thumb NOP.
#[test]
fn test_encode_i32_wrap_i64_nop_thumb2() {
    // When rd == rnlo, should be a NOP
    let bytes = ArmEncoder::new_thumb2()
        .encode(&ArmOp::I32WrapI64 {
            rd: Reg::R0,
            rnlo: Reg::R0,
        })
        .unwrap();

    assert_eq!(bytes.len(), 2, "I32WrapI64 same reg should be NOP (2 bytes)");

    // NOP
    let nop_encoding = vec![0x00, 0xBF];
    assert_eq!(bytes, nop_encoding);
}
8843
/// Wrapping an i64 from R0 into a different register R2 must emit at least 2 bytes.
#[test]
fn test_encode_i32_wrap_i64_diff_reg_thumb2() {
    let wrap = ArmOp::I32WrapI64 {
        rd: Reg::R2,
        rnlo: Reg::R0,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&wrap).unwrap();

    // MOV R2, R0 (2 or 4 bytes)
    let encoded_len = bytes.len();
    assert!(
        encoded_len >= 2,
        "I32WrapI64 diff reg should emit at least 2 bytes"
    );
}
8858
/// `I64Eqz` on R0/R1 with result in R0 must emit at least 6 bytes in Thumb-2 mode.
#[test]
fn test_encode_i64_eqz_thumb2() {
    let bytes = ArmEncoder::new_thumb2()
        .encode(&ArmOp::I64Eqz {
            rd: Reg::R0,
            rnlo: Reg::R0,
            rnhi: Reg::R1,
        })
        .unwrap();

    // Delegates to I64SetCondZ which is already encoded
    let encoded_len = bytes.len();
    assert!(
        encoded_len >= 6,
        "I64Eqz should emit at least 6 bytes for ORR+ITE+MOV+MOV"
    );
}
8874
/// `I64Eq` comparing R0/R1 with R2/R3 must emit at least 10 bytes in Thumb-2 mode.
#[test]
fn test_encode_i64_eq_thumb2() {
    let equals = ArmOp::I64Eq {
        rd: Reg::R0,
        rnlo: Reg::R0,
        rnhi: Reg::R1,
        rmlo: Reg::R2,
        rmhi: Reg::R3,
    };

    let bytes = ArmEncoder::new_thumb2().encode(&equals).unwrap();

    // Delegates to I64SetCond EQ: CMP lo + IT EQ + CMPEQ hi + ITE EQ + MOV 1 + MOV 0
    assert!(bytes.len() >= 10, "I64Eq should emit at least 10 bytes");
}
8889
/// `I64Ldr` of R0/R1 from `[SP, #0]` must emit at least 4 bytes in Thumb-2 mode.
#[test]
fn test_encode_i64_ldr_thumb2() {
    let bytes = ArmEncoder::new_thumb2()
        .encode(&ArmOp::I64Ldr {
            rdlo: Reg::R0,
            rdhi: Reg::R1,
            addr: MemAddr::imm(Reg::SP, 0),
        })
        .unwrap();

    // Two LDR instructions (lo at offset, hi at offset+4)
    assert!(bytes.len() >= 4, "I64Ldr should emit at least 4 bytes");
}
8902
/// #372: an indexed `I64Ldr` must materialize `ip = base + index` before loading,
/// while a frame (non-indexed) `I64Ldr` must not emit any register ADD.W at all.
#[test]
fn test_372_i64_ldr_indexed_materializes_address() {
    // #372: a memory i64.load carries an index register (R11 + addr + off).
    // The encoder must materialize `ip = base + index` (ADD.W) and load via
    // `[ip,#off]` — NOT drop the index. A frame (non-indexed) i64.load must
    // stay a plain `[base,#off]` access with no ADD.
    let encoder = ArmEncoder::new_thumb2();

    let indexed = encoder
        .encode(&ArmOp::I64Ldr {
            rdlo: Reg::R0,
            rdhi: Reg::R1,
            addr: MemAddr::reg_imm(Reg::R11, Reg::R0, 0),
        })
        .unwrap();
    // ADD.W ip, fp, r0 = eb0b 0c00 (byte-verified vs arm-none-eabi-as).
    assert_eq!(
        &indexed[0..4],
        &[0x0b, 0xeb, 0x00, 0x0c],
        "indexed I64Ldr must start with ADD.W ip, base, index"
    );

    let frame = encoder
        .encode(&ArmOp::I64Ldr {
            rdlo: Reg::R0,
            rdhi: Reg::R1,
            addr: MemAddr::imm(Reg::SP, 8),
        })
        .unwrap();
    // No index -> no ADD.W prefix. ADD{S}.W (register, encoding T3) has a first
    // halfword of 0xEB00 | S << 4 | Rn, so mask off S and Rn to reject the
    // instruction for ANY base register. Comparing against the R11-based bytes
    // 0x0b 0xeb alone would let `ADD.W ip, sp, rX` (0x0d 0xeb) slip through,
    // and SP is the base actually used by this access.
    let frame_first_halfword = u16::from_le_bytes([frame[0], frame[1]]);
    assert_ne!(
        frame_first_halfword & 0xFFE0,
        0xEB00,
        "frame (non-indexed) I64Ldr must NOT emit an ADD.W"
    );
}
8937
/// `I64Str` of R0/R1 to `[SP, #0]` must emit at least 4 bytes in Thumb-2 mode.
#[test]
fn test_encode_i64_str_thumb2() {
    let store_pair = ArmOp::I64Str {
        rdlo: Reg::R0,
        rdhi: Reg::R1,
        addr: MemAddr::imm(Reg::SP, 0),
    };

    let bytes = ArmEncoder::new_thumb2().encode(&store_pair).unwrap();

    // Two STR instructions (lo at offset, hi at offset+4)
    assert!(bytes.len() >= 4, "I64Str should emit at least 4 bytes");
}
8950
8951    #[test]
8952    fn test_encode_i64_all_comparisons_thumb2() {
8953        let encoder = ArmEncoder::new_thumb2();
8954
8955        let ops = vec![
8956            ArmOp::I64Ne {
8957                rd: Reg::R0,
8958                rnlo: Reg::R0,
8959                rnhi: Reg::R1,
8960                rmlo: Reg::R2,
8961                rmhi: Reg::R3,
8962            },
8963            ArmOp::I64LtS {
8964                rd: Reg::R0,
8965                rnlo: Reg::R0,
8966                rnhi: Reg::R1,
8967                rmlo: Reg::R2,
8968                rmhi: Reg::R3,
8969            },
8970            ArmOp::I64LtU {
8971                rd: Reg::R0,
8972                rnlo: Reg::R0,
8973                rnhi: Reg::R1,
8974                rmlo: Reg::R2,
8975                rmhi: Reg::R3,
8976            },
8977            ArmOp::I64LeS {
8978                rd: Reg::R0,
8979                rnlo: Reg::R0,
8980                rnhi: Reg::R1,
8981                rmlo: Reg::R2,
8982                rmhi: Reg::R3,
8983            },
8984            ArmOp::I64LeU {
8985                rd: Reg::R0,
8986                rnlo: Reg::R0,
8987                rnhi: Reg::R1,
8988                rmlo: Reg::R2,
8989                rmhi: Reg::R3,
8990            },
8991            ArmOp::I64GtS {
8992                rd: Reg::R0,
8993                rnlo: Reg::R0,
8994                rnhi: Reg::R1,
8995                rmlo: Reg::R2,
8996                rmhi: Reg::R3,
8997            },
8998            ArmOp::I64GtU {
8999                rd: Reg::R0,
9000                rnlo: Reg::R0,
9001                rnhi: Reg::R1,
9002                rmlo: Reg::R2,
9003                rmhi: Reg::R3,
9004            },
9005            ArmOp::I64GeS {
9006                rd: Reg::R0,
9007                rnlo: Reg::R0,
9008                rnhi: Reg::R1,
9009                rmlo: Reg::R2,
9010                rmhi: Reg::R3,
9011            },
9012            ArmOp::I64GeU {
9013                rd: Reg::R0,
9014                rnlo: Reg::R0,
9015                rnhi: Reg::R1,
9016                rmlo: Reg::R2,
9017                rmhi: Reg::R3,
9018            },
9019        ];
9020
9021        for op in &ops {
9022            let code = encoder.encode(op).unwrap();
9023            assert!(
9024                code.len() >= 8,
9025                "i64 comparison {:?} should emit at least 8 bytes, got {}",
9026                op,
9027                code.len()
9028            );
9029        }
9030    }
9031
9032    #[test]
9033    fn test_encode_i64_const_zero_thumb2() {
9034        let encoder = ArmEncoder::new_thumb2();
9035        let op = ArmOp::I64Const {
9036            rdlo: Reg::R0,
9037            rdhi: Reg::R1,
9038            value: 0,
9039        };
9040        let code = encoder.encode(&op).unwrap();
9041        // MOVW R0, #0 (4 bytes) + MOVW R1, #0 (4 bytes) = 8 bytes
9042        assert_eq!(code.len(), 8, "I64Const(0) should be 8 bytes");
9043    }
9044
9045    #[test]
9046    fn test_encode_i64_const_negative_one_thumb2() {
9047        let encoder = ArmEncoder::new_thumb2();
9048        let op = ArmOp::I64Const {
9049            rdlo: Reg::R0,
9050            rdhi: Reg::R1,
9051            value: -1, // 0xFFFF_FFFF_FFFF_FFFF
9052        };
9053        let code = encoder.encode(&op).unwrap();
9054        // MOVW + MOVT for lo (8 bytes) + MOVW + MOVT for hi (8 bytes) = 16 bytes
9055        assert_eq!(code.len(), 16, "I64Const(-1) should be 16 bytes");
9056    }
9057
9058    // =========================================================================
9059    // Sub-word load/store encoding tests
9060    // =========================================================================
9061
9062    #[test]
9063    fn test_encode_ldrb_arm32() {
9064        let encoder = ArmEncoder::new_arm32();
9065        let op = ArmOp::Ldrb {
9066            rd: Reg::R0,
9067            addr: MemAddr::imm(Reg::R1, 4),
9068        };
9069        let code = encoder.encode(&op).unwrap();
9070        assert_eq!(code.len(), 4, "ARM32 LDRB should be 4 bytes");
9071        // LDRB R0, [R1, #4] = 0xE5D10004
9072        let encoded = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
9073        assert_eq!(encoded, 0xE5D10004, "Should encode LDRB R0, [R1, #4]");
9074    }
9075
9076    #[test]
9077    fn test_encode_strb_arm32() {
9078        let encoder = ArmEncoder::new_arm32();
9079        let op = ArmOp::Strb {
9080            rd: Reg::R0,
9081            addr: MemAddr::imm(Reg::R1, 0),
9082        };
9083        let code = encoder.encode(&op).unwrap();
9084        assert_eq!(code.len(), 4, "ARM32 STRB should be 4 bytes");
9085        // STRB R0, [R1, #0] = 0xE5C10000
9086        let encoded = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
9087        assert_eq!(encoded, 0xE5C10000, "Should encode STRB R0, [R1, #0]");
9088    }
9089
9090    #[test]
9091    fn test_encode_ldrh_arm32() {
9092        let encoder = ArmEncoder::new_arm32();
9093        let op = ArmOp::Ldrh {
9094            rd: Reg::R0,
9095            addr: MemAddr::imm(Reg::R1, 2),
9096        };
9097        let code = encoder.encode(&op).unwrap();
9098        assert_eq!(code.len(), 4, "ARM32 LDRH should be 4 bytes");
9099    }
9100
9101    #[test]
9102    fn test_encode_strh_arm32() {
9103        let encoder = ArmEncoder::new_arm32();
9104        let op = ArmOp::Strh {
9105            rd: Reg::R0,
9106            addr: MemAddr::imm(Reg::R1, 0),
9107        };
9108        let code = encoder.encode(&op).unwrap();
9109        assert_eq!(code.len(), 4, "ARM32 STRH should be 4 bytes");
9110    }
9111
9112    #[test]
9113    fn test_encode_ldrsb_arm32() {
9114        let encoder = ArmEncoder::new_arm32();
9115        let op = ArmOp::Ldrsb {
9116            rd: Reg::R0,
9117            addr: MemAddr::imm(Reg::R1, 0),
9118        };
9119        let code = encoder.encode(&op).unwrap();
9120        assert_eq!(code.len(), 4, "ARM32 LDRSB should be 4 bytes");
9121    }
9122
9123    #[test]
9124    fn test_encode_ldrsh_arm32() {
9125        let encoder = ArmEncoder::new_arm32();
9126        let op = ArmOp::Ldrsh {
9127            rd: Reg::R0,
9128            addr: MemAddr::imm(Reg::R1, 0),
9129        };
9130        let code = encoder.encode(&op).unwrap();
9131        assert_eq!(code.len(), 4, "ARM32 LDRSH should be 4 bytes");
9132    }
9133
9134    #[test]
9135    fn test_encode_ldrb_thumb2_16bit() {
9136        let encoder = ArmEncoder::new_thumb2();
9137        let op = ArmOp::Ldrb {
9138            rd: Reg::R0,
9139            addr: MemAddr::imm(Reg::R1, 4),
9140        };
9141        let code = encoder.encode(&op).unwrap();
9142        // Low registers + small offset -> 16-bit encoding
9143        assert_eq!(
9144            code.len(),
9145            2,
9146            "Thumb-2 LDRB with small offset should be 16-bit"
9147        );
9148    }
9149
9150    #[test]
9151    fn test_encode_ldrb_thumb2_32bit() {
9152        let encoder = ArmEncoder::new_thumb2();
9153        let op = ArmOp::Ldrb {
9154            rd: Reg::R0,
9155            addr: MemAddr::imm(Reg::R1, 100), // offset > 31 needs 32-bit
9156        };
9157        let code = encoder.encode(&op).unwrap();
9158        assert_eq!(
9159            code.len(),
9160            4,
9161            "Thumb-2 LDRB with large offset should be 32-bit"
9162        );
9163    }
9164
9165    #[test]
9166    fn test_encode_strb_thumb2_16bit() {
9167        let encoder = ArmEncoder::new_thumb2();
9168        let op = ArmOp::Strb {
9169            rd: Reg::R0,
9170            addr: MemAddr::imm(Reg::R1, 10),
9171        };
9172        let code = encoder.encode(&op).unwrap();
9173        assert_eq!(
9174            code.len(),
9175            2,
9176            "Thumb-2 STRB with small offset should be 16-bit"
9177        );
9178    }
9179
9180    #[test]
9181    fn test_encode_ldrh_thumb2_16bit() {
9182        let encoder = ArmEncoder::new_thumb2();
9183        let op = ArmOp::Ldrh {
9184            rd: Reg::R0,
9185            addr: MemAddr::imm(Reg::R1, 4), // offset aligned to 2, <= 62
9186        };
9187        let code = encoder.encode(&op).unwrap();
9188        assert_eq!(
9189            code.len(),
9190            2,
9191            "Thumb-2 LDRH with small aligned offset should be 16-bit"
9192        );
9193    }
9194
9195    #[test]
9196    fn test_encode_strh_thumb2_16bit() {
9197        let encoder = ArmEncoder::new_thumb2();
9198        let op = ArmOp::Strh {
9199            rd: Reg::R0,
9200            addr: MemAddr::imm(Reg::R1, 4),
9201        };
9202        let code = encoder.encode(&op).unwrap();
9203        assert_eq!(
9204            code.len(),
9205            2,
9206            "Thumb-2 STRH with small aligned offset should be 16-bit"
9207        );
9208    }
9209
9210    #[test]
9211    fn test_encode_ldrsb_thumb2() {
9212        let encoder = ArmEncoder::new_thumb2();
9213        let op = ArmOp::Ldrsb {
9214            rd: Reg::R0,
9215            addr: MemAddr::imm(Reg::R1, 0),
9216        };
9217        let code = encoder.encode(&op).unwrap();
9218        // LDRSB has no 16-bit immediate form, always 32-bit
9219        assert_eq!(code.len(), 4, "Thumb-2 LDRSB should be 32-bit");
9220    }
9221
9222    #[test]
9223    fn test_encode_ldrsh_thumb2() {
9224        let encoder = ArmEncoder::new_thumb2();
9225        let op = ArmOp::Ldrsh {
9226            rd: Reg::R0,
9227            addr: MemAddr::imm(Reg::R1, 0),
9228        };
9229        let code = encoder.encode(&op).unwrap();
9230        assert_eq!(code.len(), 4, "Thumb-2 LDRSH should be 32-bit");
9231    }
9232
9233    #[test]
9234    fn test_encode_memory_size_thumb2() {
9235        let encoder = ArmEncoder::new_thumb2();
9236        let op = ArmOp::MemorySize { rd: Reg::R0 };
9237        let code = encoder.encode(&op).unwrap();
9238        // R0 and R10 are not both low registers, so this needs careful handling
9239        assert!(!code.is_empty(), "MemorySize should produce code");
9240    }
9241
9242    #[test]
9243    fn test_encode_memory_grow_thumb2() {
9244        let encoder = ArmEncoder::new_thumb2();
9245        let op = ArmOp::MemoryGrow {
9246            rd: Reg::R0,
9247            rn: Reg::R0,
9248        };
9249        let code = encoder.encode(&op).unwrap();
9250        assert_eq!(code.len(), 4, "MemoryGrow (MVN) should be 32-bit Thumb-2");
9251    }
9252
9253    #[test]
9254    fn test_encode_subword_reg_offset_thumb2() {
9255        let encoder = ArmEncoder::new_thumb2();
9256
9257        // LDRB with register offset
9258        let op = ArmOp::Ldrb {
9259            rd: Reg::R0,
9260            addr: MemAddr::reg(Reg::R1, Reg::R2),
9261        };
9262        let code = encoder.encode(&op).unwrap();
9263        assert_eq!(
9264            code.len(),
9265            4,
9266            "Thumb-2 LDRB with reg offset should be 32-bit"
9267        );
9268
9269        // STRB with register offset
9270        let op = ArmOp::Strb {
9271            rd: Reg::R0,
9272            addr: MemAddr::reg(Reg::R1, Reg::R2),
9273        };
9274        let code = encoder.encode(&op).unwrap();
9275        assert_eq!(
9276            code.len(),
9277            4,
9278            "Thumb-2 STRB with reg offset should be 32-bit"
9279        );
9280
9281        // LDRH with register offset
9282        let op = ArmOp::Ldrh {
9283            rd: Reg::R0,
9284            addr: MemAddr::reg(Reg::R1, Reg::R2),
9285        };
9286        let code = encoder.encode(&op).unwrap();
9287        assert_eq!(
9288            code.len(),
9289            4,
9290            "Thumb-2 LDRH with reg offset should be 32-bit"
9291        );
9292
9293        // STRH with register offset
9294        let op = ArmOp::Strh {
9295            rd: Reg::R0,
9296            addr: MemAddr::reg(Reg::R1, Reg::R2),
9297        };
9298        let code = encoder.encode(&op).unwrap();
9299        assert_eq!(
9300            code.len(),
9301            4,
9302            "Thumb-2 STRH with reg offset should be 32-bit"
9303        );
9304    }
9305
9306    #[test]
9307    fn test_encode_subword_reg_imm_offset_thumb2() {
9308        let encoder = ArmEncoder::new_thumb2();
9309
9310        // LDRB with both register and immediate offset
9311        let op = ArmOp::Ldrb {
9312            rd: Reg::R0,
9313            addr: MemAddr::reg_imm(Reg::R1, Reg::R2, 4),
9314        };
9315        let code = encoder.encode(&op).unwrap();
9316        // ADD R12, R2, #4 (4 bytes) + LDRB R0, [R1, R12] (4 bytes) = 8 bytes
9317        assert_eq!(
9318            code.len(),
9319            8,
9320            "Thumb-2 LDRB with reg+imm offset should be 8 bytes"
9321        );
9322    }
9323
9324    // ========================================================================
9325    // Helium MVE encoding tests
9326    // ========================================================================
9327
9328    #[test]
9329    fn test_encode_mve_addi32_thumb2() {
9330        let encoder = ArmEncoder::new_thumb2();
9331        let op = ArmOp::MveAddI {
9332            qd: QReg::Q0,
9333            qn: QReg::Q1,
9334            qm: QReg::Q2,
9335            size: MveSize::S32,
9336        };
9337        let code = encoder.encode(&op).unwrap();
9338        assert_eq!(
9339            code.len(),
9340            4,
9341            "MVE VADD.I32 should be 4 bytes (Thumb-2 32-bit)"
9342        );
9343    }
9344
9345    #[test]
9346    fn test_encode_mve_subi16_thumb2() {
9347        let encoder = ArmEncoder::new_thumb2();
9348        let op = ArmOp::MveSubI {
9349            qd: QReg::Q0,
9350            qn: QReg::Q1,
9351            qm: QReg::Q2,
9352            size: MveSize::S16,
9353        };
9354        let code = encoder.encode(&op).unwrap();
9355        assert_eq!(code.len(), 4, "MVE VSUB.I16 should be 4 bytes");
9356    }
9357
9358    #[test]
9359    fn test_encode_mve_muli8_thumb2() {
9360        let encoder = ArmEncoder::new_thumb2();
9361        let op = ArmOp::MveMulI {
9362            qd: QReg::Q0,
9363            qn: QReg::Q1,
9364            qm: QReg::Q2,
9365            size: MveSize::S8,
9366        };
9367        let code = encoder.encode(&op).unwrap();
9368        assert_eq!(code.len(), 4, "MVE VMUL.I8 should be 4 bytes");
9369    }
9370
9371    #[test]
9372    fn test_encode_mve_bitwise_thumb2() {
9373        let encoder = ArmEncoder::new_thumb2();
9374
9375        let ops = vec![
9376            ArmOp::MveAnd {
9377                qd: QReg::Q0,
9378                qn: QReg::Q1,
9379                qm: QReg::Q2,
9380            },
9381            ArmOp::MveOrr {
9382                qd: QReg::Q0,
9383                qn: QReg::Q1,
9384                qm: QReg::Q2,
9385            },
9386            ArmOp::MveEor {
9387                qd: QReg::Q0,
9388                qn: QReg::Q1,
9389                qm: QReg::Q2,
9390            },
9391            ArmOp::MveBic {
9392                qd: QReg::Q0,
9393                qn: QReg::Q1,
9394                qm: QReg::Q2,
9395            },
9396        ];
9397        for op in ops {
9398            let code = encoder.encode(&op).unwrap();
9399            assert_eq!(code.len(), 4, "MVE bitwise op should be 4 bytes");
9400        }
9401    }
9402
9403    #[test]
9404    fn test_encode_mve_mvn_thumb2() {
9405        let encoder = ArmEncoder::new_thumb2();
9406        let op = ArmOp::MveMvn {
9407            qd: QReg::Q0,
9408            qm: QReg::Q1,
9409        };
9410        let code = encoder.encode(&op).unwrap();
9411        assert_eq!(code.len(), 4, "MVE VMVN should be 4 bytes");
9412    }
9413
9414    #[test]
9415    fn test_encode_mve_load_store_thumb2() {
9416        let encoder = ArmEncoder::new_thumb2();
9417
9418        let load = ArmOp::MveLoad {
9419            qd: QReg::Q0,
9420            addr: MemAddr::imm(Reg::R0, 16),
9421        };
9422        let code = encoder.encode(&load).unwrap();
9423        assert_eq!(code.len(), 4, "MVE VLDRW.32 should be 4 bytes");
9424
9425        let store = ArmOp::MveStore {
9426            qd: QReg::Q1,
9427            addr: MemAddr::imm(Reg::R1, 0),
9428        };
9429        let code = encoder.encode(&store).unwrap();
9430        assert_eq!(code.len(), 4, "MVE VSTRW.32 should be 4 bytes");
9431    }
9432
9433    #[test]
9434    fn test_encode_mve_const_thumb2() {
9435        let encoder = ArmEncoder::new_thumb2();
9436        let op = ArmOp::MveConst {
9437            qd: QReg::Q0,
9438            bytes: [1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0],
9439        };
9440        let code = encoder.encode(&op).unwrap();
9441        // Should be 4 words of (MOVW R12 + VMOV Sn) = 4 * (4+4) = 32 bytes min
9442        // Some words with hi16=0 skip MOVT, so length varies
9443        assert!(
9444            code.len() >= 24,
9445            "MVE const should produce multiple instructions"
9446        );
9447    }
9448
9449    #[test]
9450    fn test_encode_mve_dup_thumb2() {
9451        let encoder = ArmEncoder::new_thumb2();
9452        let op = ArmOp::MveDup {
9453            qd: QReg::Q0,
9454            rn: Reg::R0,
9455            size: MveSize::S32,
9456        };
9457        let code = encoder.encode(&op).unwrap();
9458        assert_eq!(code.len(), 4, "MVE VDUP.32 should be 4 bytes");
9459    }
9460
9461    #[test]
9462    fn test_encode_mve_extract_lane_thumb2() {
9463        let encoder = ArmEncoder::new_thumb2();
9464        let op = ArmOp::MveExtractLane {
9465            rd: Reg::R0,
9466            qn: QReg::Q1,
9467            lane: 2,
9468            size: MveSize::S32,
9469        };
9470        let code = encoder.encode(&op).unwrap();
9471        assert_eq!(code.len(), 4, "MVE extract lane should be 4 bytes");
9472    }
9473
9474    #[test]
9475    fn test_encode_mve_insert_lane_thumb2() {
9476        let encoder = ArmEncoder::new_thumb2();
9477        let op = ArmOp::MveInsertLane {
9478            qd: QReg::Q0,
9479            rn: Reg::R1,
9480            lane: 3,
9481            size: MveSize::S32,
9482        };
9483        let code = encoder.encode(&op).unwrap();
9484        assert_eq!(code.len(), 4, "MVE insert lane should be 4 bytes");
9485    }
9486
9487    #[test]
9488    fn test_encode_mve_addf32_thumb2() {
9489        let encoder = ArmEncoder::new_thumb2();
9490        let op = ArmOp::MveAddF32 {
9491            qd: QReg::Q0,
9492            qn: QReg::Q1,
9493            qm: QReg::Q2,
9494        };
9495        let code = encoder.encode(&op).unwrap();
9496        assert_eq!(code.len(), 4, "MVE VADD.F32 should be 4 bytes");
9497    }
9498
9499    #[test]
9500    fn test_encode_mve_divf32_thumb2() {
9501        let encoder = ArmEncoder::new_thumb2();
9502        let op = ArmOp::MveDivF32 {
9503            qd: QReg::Q0,
9504            qn: QReg::Q1,
9505            qm: QReg::Q2,
9506        };
9507        let code = encoder.encode(&op).unwrap();
9508        // Lane-wise: 4 x VDIV.F32 = 4 x 4 = 16 bytes
9509        assert_eq!(
9510            code.len(),
9511            16,
9512            "MVE VDIV.F32 (lane-wise) should be 16 bytes"
9513        );
9514    }
9515
9516    #[test]
9517    fn test_encode_mve_sqrtf32_thumb2() {
9518        let encoder = ArmEncoder::new_thumb2();
9519        let op = ArmOp::MveSqrtF32 {
9520            qd: QReg::Q0,
9521            qm: QReg::Q1,
9522        };
9523        let code = encoder.encode(&op).unwrap();
9524        // Lane-wise: 4 x VSQRT.F32 = 4 x 4 = 16 bytes
9525        assert_eq!(
9526            code.len(),
9527            16,
9528            "MVE VSQRT.F32 (lane-wise) should be 16 bytes"
9529        );
9530    }
9531
9532    #[test]
9533    fn test_encode_mve_negf32_thumb2() {
9534        let encoder = ArmEncoder::new_thumb2();
9535        let op = ArmOp::MveNegF32 {
9536            qd: QReg::Q0,
9537            qm: QReg::Q1,
9538        };
9539        let code = encoder.encode(&op).unwrap();
9540        assert_eq!(code.len(), 4, "MVE VNEG.F32 should be 4 bytes");
9541    }
9542
9543    #[test]
9544    fn test_encode_mve_absf32_thumb2() {
9545        let encoder = ArmEncoder::new_thumb2();
9546        let op = ArmOp::MveAbsF32 {
9547            qd: QReg::Q0,
9548            qm: QReg::Q1,
9549        };
9550        let code = encoder.encode(&op).unwrap();
9551        assert_eq!(code.len(), 4, "MVE VABS.F32 should be 4 bytes");
9552    }
9553
9554    /// VCR-RA-001 / immediate-folding precondition: pins the Thumb-2 `AND`
9555    /// immediate encoding for the byte range and documents its bound.
9556    ///
9557    /// The `And { Operand2::Imm }` encoder packs the low 12 bits straight into
9558    /// the `i:imm3:imm8` field WITHOUT applying ThumbExpandImm (the modified-
9559    /// immediate expansion). For `imm <= 0xFF` (e.g. gale's int8 clamps
9560    /// `#0x7e` / `#0x7f`) that is correct — `i:imm3 = 0000` means "imm8
9561    /// zero-extended". So `and r2, r0, #0x7e` encodes to the canonical
9562    /// `00 f0 7e 02`. For `imm >= 0x100` the field would need a true
9563    /// ThumbExpandImm pattern (rotation / replication), which is NOT
9564    /// implemented here — so **immediate folding must gate on `imm <= 0xFF`**
9565    /// until the encoder is hardened to ThumbExpandImm/Ok-or-Err (the
9566    /// "encoder must be Ok-or-Err, never silently wrong" principle, #180/#185).
9567    /// This bound covers the measured `flat_flight` waste (#209).
9568    #[test]
9569    fn and_immediate_encodes_correctly_in_byte_range_documents_fold_bound() {
9570        let encoder = ArmEncoder::new_thumb2();
9571        let op = ArmOp::And {
9572            rd: Reg::R2,
9573            rn: Reg::R0,
9574            op2: Operand2::Imm(0x7e),
9575        };
9576        let code = encoder.encode(&op).unwrap();
9577        assert_eq!(
9578            code,
9579            vec![0x00, 0xf0, 0x7e, 0x02],
9580            "and r2, r0, #0x7e must encode to the canonical AND.W T1 (imm8=0x7e)"
9581        );
9582    }
9583
9584    /// #255: the shared ThumbExpandImm reverse-encoder underpinning the
9585    /// data-processing immediate fix. Encodable modified immediates round-trip to
9586    /// the expected `i:imm3:imm8` field; a genuinely non-modified value is `None`
9587    /// (caller must materialize into a register). Note `1000 = 0xFA ror 30` *is*
9588    /// representable (field 0xF7A) — the old encoder mis-encoded it (raw 0x3E8);
9589    /// this encodes it correctly.
9590    #[test]
9591    fn try_thumb_expand_imm_encodes_modified_immediates() {
9592        assert_eq!(try_thumb_expand_imm(0x7e), Some(0x07e)); // zero-extended byte
9593        assert_eq!(try_thumb_expand_imm(0xff), Some(0x0ff));
9594        assert_eq!(try_thumb_expand_imm(0x0001_0001), Some(0x101)); // 0x00XY00XY
9595        assert_eq!(try_thumb_expand_imm(0xff00_ff00), Some(0x2ff)); // 0xXY00XY00
9596        assert_eq!(try_thumb_expand_imm(0xffff_ffff), Some(0x3ff)); // 0xXYXYXYXY
9597        assert_eq!(try_thumb_expand_imm(0x100), Some(0xf80)); // 0x80 ror 31
9598        assert_eq!(try_thumb_expand_imm(0x8000_0000), Some(0x400)); // 0x80 ror 8
9599        assert_eq!(try_thumb_expand_imm(1000), Some(0xf7a)); // 0xFA ror 30
9600        // Genuinely unrepresentable (bits too far apart for an 8-bit window).
9601        assert_eq!(try_thumb_expand_imm(0x101), None);
9602        assert_eq!(try_thumb_expand_imm(0x12345), None);
9603    }
9604
9605    /// #255: CMP/ADDS/SUBS encode any valid modified immediate correctly, and
9606    /// ERROR (not silently mis-encode) on a genuinely unrepresentable one,
9607    /// forcing the selector to materialize into a register — closing the
9608    /// silent-miscompile class of #251/#253.
9609    #[test]
9610    fn cmp_adds_subs_immediate_error_on_non_modified_imm() {
9611        let encoder = ArmEncoder::new_thumb2();
9612        // cmp r0, #0xff → valid → Ok; cmp r0, #1000 → valid (0xFA ror 30) → Ok.
9613        assert!(encoder.encode_thumb32_cmp_imm(&Reg::R0, 0xff).is_ok());
9614        assert!(encoder.encode_thumb32_cmp_imm(&Reg::R0, 1000).is_ok());
9615        // cmp r0, #0x101 → NOT a modified immediate → Err (materialize-reg).
9616        assert!(
9617            encoder.encode_thumb32_cmp_imm(&Reg::R0, 0x101).is_err(),
9618            "cmp #0x101 must error, not compare the wrong constant"
9619        );
9620        assert!(
9621            encoder
9622                .encode_thumb32_adds(&Reg::R0, &Reg::R0, 0x101)
9623                .is_err()
9624        );
9625        assert!(
9626            encoder
9627                .encode_thumb32_subs(&Reg::R0, &Reg::R0, 0x101)
9628                .is_err()
9629        );
9630        // ...but a valid modified immediate still encodes.
9631        assert!(
9632            encoder
9633                .encode_thumb32_adds(&Reg::R0, &Reg::R0, 0x80)
9634                .is_ok()
9635        );
9636    }
9637
9638    /// #257: MLA (multiply-accumulate) encodes as MLS without the bit-4 op flag.
9639    /// `mla r2, r3, r4, r8` (rd=r2, rn=r3, rm=r4, ra=r8) → Thumb-2 `03 fb 04 82`.
9640    #[test]
9641    fn mla_thumb2_encodes_correctly() {
9642        let encoder = ArmEncoder::new_thumb2();
9643        let code = encoder
9644            .encode(&ArmOp::Mla {
9645                rd: Reg::R2,
9646                rn: Reg::R3,
9647                rm: Reg::R4,
9648                ra: Reg::R8,
9649            })
9650            .unwrap();
9651        // hw1 = 0xFB03, hw2 = (8<<12)|(2<<8)|4 = 0x8204
9652        assert_eq!(code, vec![0x03, 0xfb, 0x04, 0x82]);
9653    }
9654
9655    /// #259: LDR/STR (and sub-word) immediate-offset encoders truncated
9656    /// `offset & 0xFFF`, silently targeting the wrong address for offset >= 4096.
9657    /// They now error (the selector must use register-offset addressing) — the
9658    /// load/store sibling of the #253/#255 class. Offsets <= 4095 still encode.
9659    #[test]
9660    fn ldst_imm12_offset_errors_when_out_of_range() {
9661        let encoder = ArmEncoder::new_thumb2();
9662        // offset 0xFFF (4095): valid → Ok; ldr r0, [r1, #4095].
9663        assert!(
9664            encoder
9665                .encode_thumb32_ldr(&Reg::R0, &Reg::R1, 0xFFF)
9666                .is_ok()
9667        );
9668        // offset 0x1000 (4096): out of imm12 range → Err (not & 0xFFF → #0).
9669        assert!(
9670            encoder
9671                .encode_thumb32_ldr(&Reg::R0, &Reg::R1, 0x1000)
9672                .is_err(),
9673            "ldr offset 4096 must error, not wrap to 0"
9674        );
9675        assert!(
9676            encoder
9677                .encode_thumb32_str(&Reg::R0, &Reg::R1, 0x1000)
9678                .is_err()
9679        );
9680        assert!(
9681            encoder
9682                .encode_thumb32_ldrb_imm(&Reg::R0, &Reg::R1, 5000)
9683                .is_err()
9684        );
9685        assert!(
9686            encoder
9687                .encode_thumb32_strh_imm(&Reg::R0, &Reg::R1, 5000)
9688                .is_err()
9689        );
9690    }
9691
9692    /// Latent miscompile fix: ADD/SUB with a >0xFF immediate (e.g.
9693    /// `add sp, sp, #frame` for a >=256-byte frame) used ADD.W (T3), whose
9694    /// `i:imm3:imm8` is a ThumbExpandImm modified immediate — so `#256` silently
9695    /// encoded as `#0` (stack corruption). Use ADDW/SUBW (T4), a PLAIN 12-bit
9696    /// immediate, for 0x100..=0xFFF; keep T3 for <=0xFF (bit-identical); error
9697    /// beyond 4095.
9698    #[test]
9699    fn add_sub_large_immediate_use_addw_subw_not_misencoded() {
9700        let encoder = ArmEncoder::new_thumb2();
9701        // add sp, sp, #256  →  ADDW (T4) SP, SP, #256  =  0d f2 00 1d
9702        assert_eq!(
9703            encoder
9704                .encode(&ArmOp::Add {
9705                    rd: Reg::SP,
9706                    rn: Reg::SP,
9707                    op2: Operand2::Imm(256),
9708                })
9709                .unwrap(),
9710            vec![0x0d, 0xf2, 0x00, 0x1d],
9711            "add sp,sp,#256 must be ADDW (plain imm12), not a mis-encoded ADD.W"
9712        );
9713        // sub sp, sp, #256  →  SUBW (T4) SP, SP, #256  =  ad f2 00 1d
9714        assert_eq!(
9715            encoder
9716                .encode(&ArmOp::Sub {
9717                    rd: Reg::SP,
9718                    rn: Reg::SP,
9719                    op2: Operand2::Imm(256),
9720                })
9721                .unwrap(),
9722            vec![0xad, 0xf2, 0x00, 0x1d],
9723        );
9724        // > 4095 has no single-instruction encoding → error, not silent wrong.
9725        assert!(
9726            encoder
9727                .encode(&ArmOp::Add {
9728                    rd: Reg::SP,
9729                    rn: Reg::SP,
9730                    op2: Operand2::Imm(5000),
9731                })
9732                .is_err(),
9733            "add #5000 must error (no single ADDW), not mis-encode"
9734        );
9735    }
9736
9737    /// Closes the data-proc immediate class: AND and CMN now go through
9738    /// `try_thumb_expand_imm` like ORR/EOR/CMP — correct for any modified
9739    /// immediate, `Err` (not raw-pack / NOP) on an un-encodable one. The byte
9740    /// range stays bit-identical (`and r2,r0,#0x7e` is unchanged).
9741    #[test]
9742    fn and_cmn_immediate_thumb_expand_else_error() {
9743        let encoder = ArmEncoder::new_thumb2();
9744        // byte range unchanged (bit-identical with the pre-retrofit encoding)
9745        assert_eq!(
9746            encoder
9747                .encode(&ArmOp::And {
9748                    rd: Reg::R2,
9749                    rn: Reg::R0,
9750                    op2: Operand2::Imm(0x7e),
9751                })
9752                .unwrap(),
9753            vec![0x00, 0xf0, 0x7e, 0x02],
9754        );
9755        // a valid replicated modified immediate now encodes (was silently wrong)
9756        assert!(
9757            encoder
9758                .encode(&ArmOp::And {
9759                    rd: Reg::R2,
9760                    rn: Reg::R0,
9761                    op2: Operand2::Imm(0xff00ff00u32 as i32),
9762                })
9763                .is_ok()
9764        );
9765        // a genuinely un-encodable immediate errors (AND was raw-pack; CMN NOP)
9766        assert!(
9767            encoder
9768                .encode(&ArmOp::And {
9769                    rd: Reg::R2,
9770                    rn: Reg::R0,
9771                    op2: Operand2::Imm(0x101),
9772                })
9773                .is_err()
9774        );
9775        assert!(
9776            encoder
9777                .encode(&ArmOp::Cmn {
9778                    rn: Reg::R0,
9779                    op2: Operand2::Imm(0x101),
9780                })
9781                .is_err(),
9782            "CMN #0x101 must error, not emit a NOP"
9783        );
9784    }
9785
9786    /// VCR-RA-001: ORR/EOR with a small immediate must encode the real
9787    /// instruction (not a silent `0xBF00` NOP). Pins the byte range and the
9788    /// Ok-or-Err bound that makes future Or/Eor immediate folding safe.
9789    #[test]
9790    fn orr_eor_immediate_encode_in_byte_range_else_error() {
9791        let encoder = ArmEncoder::new_thumb2();
9792        // orr r2, r0, #0x7e  →  ORR.W T1, imm8=0x7e
9793        assert_eq!(
9794            encoder
9795                .encode(&ArmOp::Orr {
9796                    rd: Reg::R2,
9797                    rn: Reg::R0,
9798                    op2: Operand2::Imm(0x7e),
9799                })
9800                .unwrap(),
9801            vec![0x40, 0xf0, 0x7e, 0x02],
9802        );
9803        // eor r2, r0, #0x7e  →  EOR.W T1, imm8=0x7e
9804        assert_eq!(
9805            encoder
9806                .encode(&ArmOp::Eor {
9807                    rd: Reg::R2,
9808                    rn: Reg::R0,
9809                    op2: Operand2::Imm(0x7e),
9810                })
9811                .unwrap(),
9812            vec![0x80, 0xf0, 0x7e, 0x02],
9813        );
9814        // Out-of-range immediates error rather than silently mis-encode / NOP.
9815        assert!(
9816            encoder
9817                .encode(&ArmOp::Orr {
9818                    rd: Reg::R2,
9819                    rn: Reg::R0,
9820                    op2: Operand2::Imm(0x140),
9821                })
9822                .is_err(),
9823            "ORR #0x140 must error, not emit a NOP"
9824        );
9825    }
9826
9827    #[test]
9828    fn test_encode_mve_different_qregs() {
9829        let encoder = ArmEncoder::new_thumb2();
9830
9831        // Test that different Q-register numbers produce different encodings
9832        let op1 = ArmOp::MveAddI {
9833            qd: QReg::Q0,
9834            qn: QReg::Q0,
9835            qm: QReg::Q0,
9836            size: MveSize::S32,
9837        };
9838        let op2 = ArmOp::MveAddI {
9839            qd: QReg::Q3,
9840            qn: QReg::Q5,
9841            qm: QReg::Q7,
9842            size: MveSize::S32,
9843        };
9844        let code1 = encoder.encode(&op1).unwrap();
9845        let code2 = encoder.encode(&op2).unwrap();
9846        assert_ne!(
9847            code1, code2,
9848            "Different Q-registers should produce different encodings"
9849        );
9850    }
9851
9852    #[test]
9853    fn test_encode_mve_arm32_nop() {
9854        // MVE instructions on ARM32 encoder should produce NOP (only Thumb-2 supported)
9855        let encoder = ArmEncoder::new_arm32();
9856        let op = ArmOp::MveAddI {
9857            qd: QReg::Q0,
9858            qn: QReg::Q1,
9859            qm: QReg::Q2,
9860            size: MveSize::S32,
9861        };
9862        let code = encoder.encode(&op).unwrap();
9863        assert_eq!(code.len(), 4, "ARM32 MVE should be 4 bytes (NOP)");
9864        // NOP in ARM32 is 0xE1A00000 (MOV R0, R0)
9865        let instr = u32::from_le_bytes([code[0], code[1], code[2], code[3]]);
9866        assert_eq!(instr, 0xE1A00000, "ARM32 MVE should encode as NOP");
9867    }
9868}