Skip to main content

sp1_jit/backends/x86/
instruction_impl.rs

1#![allow(clippy::fn_to_numeric_cast)]
2
3use super::{TranspilerBackend, CONTEXT, MEMORY_PTR, PC_OFFSET, TEMP_A, TEMP_B};
4use crate::{
5    impl_alu32_imm_opt, impl_alu_imm_opt, impl_risc_alu, impl_shift32_imm_opt, ComputeInstructions,
6    ControlFlowInstructions, JitContext, MemoryInstructions, RiscOperand, RiscRegister,
7    RiscvTranspiler, SystemInstructions,
8};
9use dynasmrt::{dynasm, x64::Rq, DynasmApi, DynasmLabelApi};
10
11impl ComputeInstructions for TranspilerBackend {
12    fn add(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
13        // lhs <- lhs + rhs (64-bit)
14        impl_alu_imm_opt!(self, rd, rs1, rs2, add);
15    }
16
17    fn mul(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
18        // rd <- rs1 * rs2 (64-bit)
19        impl_risc_alu!(self, rd, rs1, rs2, TEMP_A, TEMP_B, {
20            dynasm! {
21                self;
22                .arch x64;
23                imul Rq(TEMP_A), Rq(TEMP_B)
24            }
25        })
26    }
27
28    fn and(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
29        // rd <- rs1 & rs2 (64-bit)
30        impl_alu_imm_opt!(self, rd, rs1, rs2, and);
31    }
32
33    fn or(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
34        // rd <- rs1 | rs2 (64-bit)
35        impl_alu_imm_opt!(self, rd, rs1, rs2, or);
36    }
37
38    fn xor(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
39        // rd <- rs1 ^ rs2 (64-bit)
40        impl_alu_imm_opt!(self, rd, rs1, rs2, xor);
41    }
42
43    fn div(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
44        // X86 uses [RAX::RDX] for the 64-bit divide operation.
45        // So we need to sign extend the lhs into RDX.
46        //
47        // The quotient is stored in RAX, and the remainder is stored in RDX.
48        //
49        // We can just write the quotient back into lhs, and the remainder is discarded.
50        self.emit_risc_operand_load(rs1, Rq::RAX as u8); // Load dividend directly into RAX
51        self.emit_risc_operand_load(rs2, TEMP_B);
52        dynasm! {
53            self;
54            .arch x64;
55
56            // ------------------------------------
57            // 1. Skip fault on div-by-zero
58            // ------------------------------------
59            test Rq(TEMP_B), Rq(TEMP_B);  // ZF=1 if rhs == 0
60            jz   >div_by_zero;
61
62            // Check for signed overflow (i64::MIN / -1)
63            mov  rcx, -9223372036854775808;  // i64::MIN
64            cmp  rax, rcx;
65            jne  >no_overflow;
66            cmp  Rq(TEMP_B), -1;
67            jne  >no_overflow;
68
69            // ------------------------------------
70            // 2. Handle overflow: i64::MIN / -1 = i64::MIN (wrapping)
71            // ------------------------------------
72            mov  rax, -9223372036854775808; // Result is i64::MIN
73            jmp >done;
74
75            no_overflow:;
76            // ------------------------------------
77            // 3. Perform signed divide
78            // ------------------------------------
79            // dividend already in RAX (loaded directly)
80            cqo;                          // sign-extend RAX into RDX (64-bit)
81            idiv Rq(TEMP_B);              // quotient → RAX, remainder → RDX
82            // quotient already in RAX
83            jmp >done;
84
85            // ------------------------------------
86            // 4. if rhs == 0
87            // ------------------------------------
88            div_by_zero:;
89            mov  rax, -1;                 // quotient = -1 (RISC-V spec for signed div by zero)
90
91            done:
92        }
93        self.emit_risc_register_store(Rq::RAX as u8, None, rd);
94    }
95
96    fn divu(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
97        // lhs <- lhs / rhs   (unsigned 64-bit; u64::MAX if rhs == 0)
98        // clobbers: RAX, RDX
99        self.emit_risc_operand_load(rs1, Rq::RAX as u8); // Load dividend directly into RAX
100        self.emit_risc_operand_load(rs2, TEMP_B);
101        dynasm! {
102            self;
103            .arch x64;
104
105            // ----- skip fault on div-by-zero -----
106            test Rq(TEMP_B), Rq(TEMP_B);   // ZF = 1 when rhs == 0
107            jz   >div_by_zero;
108
109            // ----- perform unsigned divide -----
110            // dividend already in RAX (loaded directly)
111            xor  rdx, rdx;                 // zero-extend: RDX = 0
112            div  Rq(TEMP_B);               // unsigned divide: RDX:RAX / rhs
113            // quotient already in RAX
114            jmp  >done;
115
116            // ----- rhs == 0 -----
117            div_by_zero:;
118            mov  rax, -1;                  // quotient = u64::MAX (0xFFFFFFFFFFFFFFFF)
119
120            done:
121        }
122        self.emit_risc_register_store(Rq::RAX as u8, None, rd);
123    }
124
125    fn mulh(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
126        // Signed multiply high: returns upper 64 bits of rs1 * rs2
127        // x86 imul for high multiply requires RAX and produces result in RDX
128        self.emit_risc_operand_load(rs1, Rq::RAX as u8); // Load multiplicand directly into RAX
129        self.emit_risc_operand_load(rs2, TEMP_B);
130        dynasm! {
131            self;
132            .arch x64;
133
134            // multiplicand already in RAX (loaded directly)
135            imul Rq(TEMP_B)          // signed 64×64 → 128; high → RDX
136            // high 64 bits already in RDX
137        }
138        self.emit_risc_register_store(Rq::RDX as u8, None, rd);
139    }
140
141    fn mulhu(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
142        // Unsigned multiply high: returns upper 64 bits of rs1 * rs2
143        // x86 mul for high multiply requires RAX and produces result in RDX
144        self.emit_risc_operand_load(rs1, Rq::RAX as u8); // Load multiplicand directly into RAX
145        self.emit_risc_operand_load(rs2, TEMP_B);
146        dynasm! {
147            self;
148            .arch x64;
149
150            // multiplicand already in RAX (loaded directly)
151            mul  Rq(TEMP_B)          // unsigned 64×64 → 128; high → RDX
152            // high 64 bits already in RDX
153        }
154        self.emit_risc_register_store(Rq::RDX as u8, None, rd);
155    }
156
157    fn mulhsu(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
158        // Mixed multiply high: signed rs1 * unsigned rs2, returns upper 64 bits
159        self.emit_risc_operand_load(rs1, Rq::RAX as u8); // Load signed multiplicand directly into RAX
160        self.emit_risc_operand_load(rs2, TEMP_B);
161        dynasm! {
162            self;
163            .arch x64;
164
165            // ──────────────────────────────────────────────────────────────
166            // 1. Move the **signed** left-hand operand (`TEMP_A`) into RAX.
167            //    ✦ The x86-64 `mul` instruction always uses RAX as its implicit
168            //      64-bit source operand, so we must place `TEMP_A` there first.
169            // ──────────────────────────────────────────────────────────────
170            // multiplicand already in RAX (optimized load)
171
172            // ──────────────────────────────────────────────────────────────
173            // 2. Preserve a second copy of `TEMP_A` in RCX.
174            //    ✦ The upcoming `mul` clobbers both RAX and RDX, erasing any
175            //      trace of the original sign.  We save `TEMP_A` in RCX so that
176            //      we can later decide whether the fix-up for a *negative*
177            //      multiplicand is required.
178            // ──────────────────────────────────────────────────────────────
179            mov rcx, rax;
180
181            // ──────────────────────────────────────────────────────────────
182            // 3. Unsigned 64×64-bit multiply:
183            //    mul Rq(TEMP_B)
184            //    ✦ Computes  RDX:RAX = (unsigned)RAX × (unsigned)TEMP_B.
185            //      The high 64 bits of the 128-bit product land in RDX.
186            // ──────────────────────────────────────────────────────────────
187            mul Rq(TEMP_B);
188
189            // ──────────────────────────────────────────────────────────────
190            // 4. Determine whether the *original* `TEMP_A` was negative.
191            //    ✦ `test rcx, rcx` sets the sign flag from RCX (the saved `TEMP_A`).
192            //    ✦ If the sign flag is *clear* (`TEMP_A` ≥ 0), we can skip the
193            //      correction step because the high half already matches the
194            //      semantics of the RISC-V MULHSU instruction.
195            // ──────────────────────────────────────────────────────────────
196            test rcx, rcx;
197            jns >store_high;          // Jump if `TEMP_A` was non-negative.
198
199            // ──────────────────────────────────────────────────────────────
200            // 5. Fix-up for negative `TEMP_A` (signed × unsigned semantics):
201            //    ✦ For a negative multiplicand, the unsigned `mul` delivered a
202            //      product that is *2⁶⁴* too large in the high word.  Subtracting
203            //      `TEMP_B` from RDX removes that excess and yields the correct
204            //      signed-high result.
205            // ──────────────────────────────────────────────────────────────
206            sub rdx, Rq(TEMP_B);
207
208            // ──────────────────────────────────────────────────────────────
209            // 6. Write the corrected high 64 bits back to the destination
210            //    RISC register specified by `TEMP_A`.
211            // ──────────────────────────────────────────────────────────────
212            store_high:
213            // result already in RDX
214        }
215        self.emit_risc_register_store(Rq::RDX as u8, None, rd);
216    }
217
218    /// Signed remainder: `rd = rs1 % rs2`  
219    /// *RISC-V rule*: if `rs2 == 0`, the result must be **0** (no fault).
220    fn rem(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
221        impl_risc_alu!(self, rd, rs1, rs2, TEMP_A, TEMP_B, {
222            dynasm! {
223                self;
224                .arch x64;
225
226                // ──────────────────────────────────────────────────────────────
227                // 0. Guard: if divisor is 0, skip the IDIV and return dividend
228                // ──────────────────────────────────────────────────────────────
229                test Rq(TEMP_B), Rq(TEMP_B);        // ZF = 1  ⇒  TEMP_B == 0
230                jz   >by_zero;                // jump to fix-up path
231
232                // ──────────────────────────────────────────────────────────────
233                // 1. Check for signed overflow (i64::MIN % -1)
234                // ──────────────────────────────────────────────────────────────
235                mov  rcx, -9223372036854775808; // Load i64::MIN into RCX
236                cmp  Rq(TEMP_A), rcx;             // Check if dividend == i64::MIN
237                jne  >no_overflow;
238                cmp  Rq(TEMP_B), -1;              // Check if divisor == -1
239                jne  >no_overflow;
240
241                // ──────────────────────────────────────────────────────────────
242                // Handle overflow: i64::MIN % -1 = 0 (wrapping)
243                // ──────────────────────────────────────────────────────────────
244                xor  Rq(TEMP_A), Rq(TEMP_A);        // TEMP_A = 0
245                jmp  >done;
246
247                no_overflow:;
248                // ──────────────────────────────────────────────────────────────
249                // 2. Prepare the **signed** 64-bit dividend in EDX:EAX
250                //    -------------------------------------------------
251                //    • EAX ← low 32 bits of TEMP_A
252                //    • CDQ  sign-extends EAX into EDX
253                //      → EDX:EAX now holds the two's-complement 64-bit value a
254                // ──────────────────────────────────────────────────────────────
255                mov  rax, Rq(TEMP_A);            // RAX = a  (signed 64-bit)
256                cqo;                          // RDX = sign(a)
257
258                // ──────────────────────────────────────────────────────────────
259                // 3. Signed divide:          a  /  b
260                //    -------------------------------------------------
261                //    • idiv r/m32   performs  (EDX:EAX) ÷ TEMP_B
262                //      – Quotient  → EAX   (ignored)
263                //      – Remainder → EDX   (what RISC-V REM returns)
264                // ──────────────────────────────────────────────────────────────
265                idiv Rq(TEMP_B);                 // signed divide
266
267                // ──────────────────────────────────────────────────────────────
268                // 4. Write the remainder (EDX) back to the destination register
269                // ──────────────────────────────────────────────────────────────
270                mov  Rq(TEMP_A), rdx;            // TEMP_A = remainder
271                jmp  >done;
272
273                // ──────────────────────────────────────────────────────────────
274                // Divisor == 0  →  result must be dividend (RISC-V spec)
275                // ──────────────────────────────────────────────────────────────
276                by_zero:;
277                // TEMP_A already contains the dividend, no change needed
278
279                done:
280            }
281        })
282    }
283
284    fn remu(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
285        impl_risc_alu!(self, rd, rs1, rs2, TEMP_A, TEMP_B, {
286            dynasm! {
287                self;
288                .arch x64;
289
290                // ──────────────────────────────────────────────────────────────
291                // 0. Guard against /0 → result = dividend (TEMP_A)
292                // ──────────────────────────────────────────────────────────────
293                test Rq(TEMP_B), Rq(TEMP_B);
294                jz   >by_zero;
295
296                // ──────────────────────────────────────────────────────────────
297                // 1. Prepare the **unsigned** 128-bit dividend in RDX:RAX
298                //    -------------------------------------------------
299                //    • Zero-extend TEMP_A into RDX:RAX.
300                // ──────────────────────────────────────────────────────────────
301                mov  rax, Rq(TEMP_A);
302                xor  rdx, rdx;
303
304                // ──────────────────────────────────────────────────────────────
305                // 2. Unsigned divide:       a  /  b
306                //    -------------------------------------------------
307                //    • div r/m64   performs  (RDX:RAX) ÷ TEMP_B
308                //      – Quotient  → RAX   (unused)
309                //      – Remainder → RDX   (what RISC-V REMU wants)
310                // ──────────────────────────────────────────────────────────────
311                div  Rq(TEMP_B);
312
313                // ──────────────────────────────────────────────────────────────
314                // 3. Write the remainder back to the destination register.
315                // ──────────────────────────────────────────────────────────────
316                mov  Rq(TEMP_A), rdx;
317                jmp  >done;
318
319                // ──────────────────────────────────────────────────────────────
320                // Divisor == 0  →  result must be dividend (RISC-V spec)
321                // ──────────────────────────────────────────────────────────────
322                by_zero:;
323                // TEMP_A already contains the dividend, no change needed
324
325                done:
326            }
327        })
328    }
329
330    fn sll(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
331        // We only can use the lower 6 bits for the shift count in 64-bit mode.
332        // In RV64I, this is also true!
333        //
334        // CL is an alias for the lower byte of RCX.
335        match rs2 {
336            RiscOperand::Immediate(imm) => {
337                self.emit_risc_operand_load(rs1, TEMP_A);
338                dynasm! {
339                    self;
340                    .arch x64;
341                    // Direct immediate shift (lower 6 bits automatically masked by x86)
342                    shl Rq(TEMP_A), (imm & 0x3F) as i8
343                }
344                self.emit_risc_register_store(TEMP_A, None, rd);
345            }
346            _ => {
347                self.emit_risc_operand_load(rs1, TEMP_A);
348                self.emit_risc_operand_load(rs2, Rq::RCX as u8);
349                dynasm! {
350                    self;
351                    .arch x64;
352                    // ──────────────────────────────────────────────────────────────
353                    // 1. Shift count is already in RCX (loaded directly).
354                    //    • Only the low 6 bits are used for 64-bit operands,
355                    //      which matches the RISC-V spec for RV64.
356                    // ──────────────────────────────────────────────────────────────
357
358                    // ──────────────────────────────────────────────────────────────
359                    // 2. Logical left shift:
360                    //      Rq(TEMP_A) ← Rq(TEMP_A) << (CL & 0x3F)
361                    //    • `shl` fills zeros from the right as it shifts left.
362                    // ──────────────────────────────────────────────────────────────
363                    shl  Rq(TEMP_A), cl         // variable-count shift
364                }
365                self.emit_risc_register_store(TEMP_A, None, rd);
366            }
367        }
368    }
369
370    fn sra(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
371        match rs2 {
372            RiscOperand::Immediate(imm) => {
373                self.emit_risc_operand_load(rs1, TEMP_A);
374                dynasm! {
375                    self;
376                    .arch x64;
377                    // Direct immediate arithmetic right shift
378                    sar Rq(TEMP_A), (imm & 0x3F) as i8
379                }
380                self.emit_risc_register_store(TEMP_A, None, rd);
381            }
382            _ => {
383                self.emit_risc_operand_load(rs1, TEMP_A);
384                self.emit_risc_operand_load(rs2, Rq::RCX as u8);
385                dynasm! {
386                    self;
387                    .arch x64;
388                    // ──────────────────────────────────────────────────────────────
389                    // 1. Shift count is already in RCX (loaded directly).
390                    //    • Only the low 6 bits are used for 64-bit operands,
391                    //      which matches the RISC-V spec for RV64.
392                    // ──────────────────────────────────────────────────────────────
393
394                    // ──────────────────────────────────────────────────────────────
395                    // 2. Arithmetic right shift:
396                    //      Rq(TEMP_A) ← (signed)Rq(TEMP_A) >> (CL & 0x3F)
397                    //    • `sar` replicates the sign bit as it shifts, so
398                    //      negative values stay negative after the operation.
399                    // ──────────────────────────────────────────────────────────────
400                    sar  Rq(TEMP_A), cl         // variable-count shift
401                }
402                self.emit_risc_register_store(TEMP_A, None, rd);
403            }
404        }
405    }
406
407    fn srl(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
408        match rs2 {
409            RiscOperand::Immediate(imm) => {
410                self.emit_risc_operand_load(rs1, TEMP_A);
411                dynasm! {
412                    self;
413                    .arch x64;
414                    // Direct immediate logical right shift
415                    shr Rq(TEMP_A), (imm & 0x3F) as i8
416                }
417                self.emit_risc_register_store(TEMP_A, None, rd);
418            }
419            _ => {
420                self.emit_risc_operand_load(rs1, TEMP_A);
421                self.emit_risc_operand_load(rs2, Rq::RCX as u8);
422                dynasm! {
423                    self;
424                    .arch x64;
425                    // ──────────────────────────────────────────────────────────────
426                    // 1. Shift count is already in RCX (loaded directly).
427                    // ──────────────────────────────────────────────────────────────
428
429                    // ──────────────────────────────────────────────────────────────
430                    // 2. Logical right shift:
431                    //      Rq(TEMP_A) ← (unsigned)Rq(TEMP_A) >> (CL & 0x3F)
432                    //    • `shr` always inserts zeros from the left, regardless
433                    //      of the operand's sign.
434                    // ──────────────────────────────────────────────────────────────
435                    shr  Rq(TEMP_A), cl
436                }
437                self.emit_risc_register_store(TEMP_A, None, rd);
438            }
439        }
440    }
441
442    fn slt(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
443        match rs2 {
444            RiscOperand::Immediate(imm) => {
445                self.emit_risc_operand_load(rs1, TEMP_A);
446                dynasm! {
447                    self;
448                    .arch x64;
449
450                    cmp Rq(TEMP_A), imm;
451
452                    // ──────────────────────────────────────────────────────────────
453                    // 2. setl  r/m8
454                    //    • Writes   1  to the target byte if  (SF ≠ OF)
455                    //      which is the signed "less than" condition.
456                    //    • We store straight into the low-byte of TEMP_A —
457                    //      dynasm's `Rb()` gives us that alias.
458                    // ──────────────────────────────────────────────────────────────
459                    setl Rb(TEMP_A);               // byte = 1 if TEMP_A < imm (signed)
460
461                    // ──────────────────────────────────────────────────────────────
462                    // 3. Zero-extend that byte back to a full 32-bit register so
463                    //    that the RISC register ends up with 0x0000_0000 or 0x0000_0001.
464                    // ──────────────────────────────────────────────────────────────
465                    movzx Rq(TEMP_A), Rb(TEMP_A)     // Rd(TEMP_A) = 0 or 1
466                }
467                self.emit_risc_register_store(TEMP_A, None, rd);
468            }
469            _ => {
470                self.emit_risc_operand_load(rs1, TEMP_A);
471                self.emit_risc_operand_load(rs2, TEMP_B);
472                dynasm! {
473                    self;
474                    .arch x64;
475
476                    cmp Rq(TEMP_A), Rq(TEMP_B);
477
478                    // ──────────────────────────────────────────────────────────────
479                    // 2. setl  r/m8
480                    //    • Writes   1  to the target byte if  (SF ≠ OF)
481                    //      which is the signed "less than" condition.
482                    //    • We store straight into the low-byte of TEMP_A —
483                    //      dynasm's `Rb()` gives us that alias.
484                    // ──────────────────────────────────────────────────────────────
485                    setl Rb(TEMP_A);               // byte = 1 if TEMP_A < TEMP_B (signed)
486
487                    // ──────────────────────────────────────────────────────────────
488                    // 3. Zero-extend that byte back to a full 32-bit register so
489                    //    that the RISC register ends up with 0x0000_0000 or 0x0000_0001.
490                    // ──────────────────────────────────────────────────────────────
491                    movzx Rq(TEMP_A), Rb(TEMP_A)     // Rd(TEMP_A) = 0 or 1
492                }
493                self.emit_risc_register_store(TEMP_A, None, rd);
494            }
495        }
496    }
497
498    fn sltu(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
499        match rs2 {
500            RiscOperand::Immediate(imm) => {
501                self.emit_risc_operand_load(rs1, TEMP_A);
502                dynasm! {
503                    self;
504                    .arch x64;
505
506                    cmp Rq(TEMP_A), imm;
507
508                    // ------------------------------------
509                    // `setb` ("below") checks the Carry Flag (CF):
510                    //   CF = 1  iff  TEMP_A < imm  in an *unsigned* sense.
511                    // ------------------------------------
512                    setb Rb(TEMP_A);
513
514                    // ------------------------------------
515                    // Zero-extend to 32 bits (0 or 1).
516                    // ------------------------------------
517                    movzx Rq(TEMP_A), Rb(TEMP_A)
518                }
519                self.emit_risc_register_store(TEMP_A, None, rd);
520            }
521            _ => {
522                self.emit_risc_operand_load(rs1, TEMP_A);
523                self.emit_risc_operand_load(rs2, TEMP_B);
524                dynasm! {
525                    self;
526                    .arch x64;
527
528                    cmp Rq(TEMP_A), Rq(TEMP_B);
529
530                    // ------------------------------------
531                    // `setb` ("below") checks the Carry Flag (CF):
532                    //   CF = 1  iff  TEMP_A < TEMP_B  in an *unsigned* sense.
533                    // ------------------------------------
534                    setb Rb(TEMP_A);
535
536                    // ------------------------------------
537                    // Zero-extend to 32 bits (0 or 1).
538                    // ------------------------------------
539                    movzx Rq(TEMP_A), Rb(TEMP_A)
540                }
541                self.emit_risc_register_store(TEMP_A, None, rd);
542            }
543        }
544    }
545
546    fn sub(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
547        // rd <- rs1 - rs2 (64-bit)
548        impl_risc_alu!(self, rd, rs1, rs2, TEMP_A, TEMP_B, {
549            dynasm! {
550                self;
551                .arch x64;
552                sub Rq(TEMP_A), Rq(TEMP_B)
553            }
554        })
555    }
556
557    fn addw(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
558        // addw performs 32-bit addition on lower 32 bits, then sign-extends result to 64 bits
559        impl_alu32_imm_opt!(self, rd, rs1, rs2, add);
560    }
561
562    fn subw(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
563        // subw performs 32-bit subtraction on lower 32 bits, then sign-extends result to 64 bits
564        impl_risc_alu!(self, rd, rs1, rs2, TEMP_A, TEMP_B, {
565            dynasm! {
566                self;
567                .arch x64;
568                sub Rd(TEMP_A), Rd(TEMP_B);
569                movsxd Rq(TEMP_A), Rd(TEMP_A)
570            }
571        })
572    }
573
574    fn sllw(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
575        // sllw performs 32-bit shift left, then sign-extends result to 64 bits
576        impl_shift32_imm_opt!(self, rd, rs1, rs2, shl);
577    }
578
579    fn srlw(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
580        // srlw performs logical right shift on lower 32 bits, then sign-extends result to 64 bits
581        impl_shift32_imm_opt!(self, rd, rs1, rs2, shr);
582    }
583
584    fn sraw(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
585        // sraw performs arithmetic right shift on lower 32 bits, then sign-extends result to 64
586        // bits
587        impl_shift32_imm_opt!(self, rd, rs1, rs2, sar);
588    }
589
590    fn mulw(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
591        // mulw performs 32-bit multiplication, then sign-extends result to 64 bits
592        impl_risc_alu!(self, rd, rs1, rs2, TEMP_A, TEMP_B, {
593            dynasm! {
594                self;
595                .arch x64;
596
597                // Perform 32-bit multiplication
598                imul Rd(TEMP_A), Rd(TEMP_B);
599
600                // Sign-extend the 32-bit result to 64 bits
601                movsxd Rq(TEMP_A), Rd(TEMP_A)
602            }
603        });
604    }
605
606    fn divw(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
607        // divw performs 32-bit signed division, then sign-extends result to 64 bits
608        self.emit_risc_operand_load(rs1, Rq::RAX as u8); // Load dividend directly into RAX
609        self.emit_risc_operand_load(rs2, TEMP_B);
610        dynasm! {
611            self;
612            .arch x64;
613
614            // Check for division by zero
615            test Rd(TEMP_B), Rd(TEMP_B);
616            jz >div_by_zero;
617
618            // Handle 32-bit overflow case on x86-64: INT_MIN / -1 traps (#DE)
619            cmp eax, i32::MIN;               // dividend == INT_MIN?
620            jne >do_div;
621            cmp Rd(TEMP_B), -1;              // divisor == -1?
622            jne >do_div;
623            mov eax, i32::MIN;               // result = INT_MIN
624            movsxd rax, eax;                 // sign-extend to 64-bit
625            jmp >done;
626
627            do_div:;
628            // Perform signed 32-bit divide
629            // dividend already in EAX (loaded directly into RAX)
630            cdq;                           // sign-extend EAX into EDX
631            idiv Rd(TEMP_B);               // quotient → EAX
632            movsxd rax, eax;               // sign-extend result to 64 bits
633            jmp >done;
634
635            // Handle overflow: i32::MIN / -1 = i32::MIN (wrapping)
636            overflow:;
637            mov rax, i32::MIN;
638            jmp >done;
639
640            div_by_zero:;
641            // For RV64I, divw by zero returns 0xFFFFFFFFFFFFFFFF (-1 sign-extended)
642            mov rax, -1;
643
644            done:
645        }
646        self.emit_risc_register_store(Rq::RAX as u8, None, rd);
647    }
648
649    fn divuw(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
650        // divuw performs 32-bit unsigned division, then sign-extends result to 64 bits
651        self.emit_risc_operand_load(rs1, Rq::RAX as u8); // Load dividend directly into RAX
652        self.emit_risc_operand_load(rs2, TEMP_B);
653        dynasm! {
654            self;
655            .arch x64;
656
657            // Check for division by zero
658            test Rd(TEMP_B), Rd(TEMP_B);
659            jz >div_by_zero;
660
661            // Perform unsigned 32-bit divide
662            // dividend already in EAX (loaded directly into RAX)
663            xor edx, edx;               // zero-extend
664            div Rd(TEMP_B);                // quotient → EAX
665            movsxd rax, eax;               // sign-extend result to 64 bits
666            jmp >done;
667
668            div_by_zero:;
669            // For RV64I, divuw by zero returns 0xFFFFFFFFFFFFFFFF (-1 sign-extended)
670            mov rax, -1;
671
672            done:
673        }
674        self.emit_risc_register_store(Rq::RAX as u8, None, rd);
675    }
676
677    fn remw(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
678        // remw performs 32-bit signed remainder, then sign-extends result to 64 bits
679        self.emit_risc_operand_load(rs1, Rq::RAX as u8); // Load dividend directly into RAX
680        self.emit_risc_operand_load(rs2, TEMP_B);
681        dynasm! {
682            self;
683            .arch x64;
684
685            // Check for division by zero
686            test Rd(TEMP_B), Rd(TEMP_B);
687            jz >rem_by_zero;
688
689            // Handle 32-bit overflow case on x86-64: INT_MIN / -1 traps (#DE)
690            cmp eax, i32::MIN;   // dividend == INT_MIN?
691            jne >do_div;
692            cmp Rd(TEMP_B), -1;              // divisor == -1?
693            jne >do_div;
694            mov eax, i32::MIN;   // result = INT_MIN
695            movsxd rax, eax;                 // sign-extend to 64-bit
696            jmp >done;
697
698            do_div:;
699            // Perform signed 32-bit remainder
700            // dividend already in EAX (loaded directly into RAX)
701            cdq;                        // sign-extend EAX into EDX
702            idiv Rd(TEMP_B);               // remainder → EDX
703            movsxd rdx, edx;               // sign-extend result to 64 bits
704            jmp >done;
705
706            // Handle overflow: i32::MIN % -1 = 0 (wrapping)
707            overflow:;
708            xor rdx, rdx;                  // remainder = 0
709            jmp >done;
710
711            rem_by_zero:;
712            // For RV64I, remw by zero returns the dividend (RAX) sign-extended
713            movsxd rdx, eax;
714
715            done:
716        }
717        self.emit_risc_register_store(Rq::RDX as u8, None, rd);
718    }
719
720    fn remuw(&mut self, rd: RiscRegister, rs1: RiscOperand, rs2: RiscOperand) {
721        // remuw performs 32-bit unsigned remainder, then sign-extends result to 64 bits
722        self.emit_risc_operand_load(rs1, Rq::RAX as u8); // Load dividend directly into RAX
723        self.emit_risc_operand_load(rs2, TEMP_B);
724        dynasm! {
725            self;
726            .arch x64;
727
728            // Check for division by zero
729            test Rd(TEMP_B), Rd(TEMP_B);
730            jz >rem_by_zero;
731
732            // Perform unsigned 32-bit remainder
733            // dividend already in EAX (loaded directly into RAX)
734            xor edx, edx;               // zero-extend (clear upper 32 bits)
735            div Rd(TEMP_B);                // remainder → EDX
736            movsxd rdx, edx;               // sign-extend result to 64 bits
737            jmp >done;
738
739            rem_by_zero:;
740            // For RV64I, remuw by zero returns the dividend (RAX) sign-extended
741            movsxd rdx, eax;
742
743            done:
744        }
745        self.emit_risc_register_store(Rq::RDX as u8, None, rd);
746    }
747
748    fn auipc(&mut self, rd: RiscRegister, imm: u64) {
749        // rd <- pc + imm
750
751        // ------------------------------------
752        // 1. Copy the current PC into TEMP_A
753        // 2. Increment the PC by the immediate.
754        // ------------------------------------
755        let value = self.pc_current.wrapping_add(imm);
756
757        // Store the result in the destination register.
758        self.emit_risc_register_store(TEMP_A, Some(value), rd);
759    }
760
761    fn lui(&mut self, rd: RiscRegister, imm: u64) {
762        // rd <- imm << 12
763        // LUI loads a 20-bit immediate shifted left by 12 bits into the destination register
764        dynasm! {
765            self;
766            .arch x64;
767
768            mov Rq(TEMP_A), imm as i32
769        }
770
771        // Store the result in the destination register.
772        self.emit_risc_register_store(TEMP_A, None, rd);
773    }
774}
775
776impl ControlFlowInstructions for TranspilerBackend {
777    fn jal(&mut self, rd: RiscRegister, imm: u64) {
778        // Mark that a control flow instruction has been inserted.
779        self.control_flow_instruction_inserted = true;
780
781        let target_pc = self.pc_current.wrapping_add(imm);
782        let next_pc = self.pc_current.wrapping_add(4);
783
784        // Store the current PC + 4 into the destination register.
785        self.emit_risc_register_store(TEMP_A, Some(next_pc), rd);
786
787        // Adjust the PC store in the context by the immediate.
788        self.update_pc(TEMP_B, target_pc);
789
790        // Add the base amount of cycles for the instruction.
791        self.bump_clk();
792
793        // We know the jump target at transpile time, we can issue jump
794        // to it directly, skipping jump table
795        self.end_branch(Some(target_pc));
796    }
797
798    fn jalr(&mut self, rd: RiscRegister, rs1: RiscRegister, imm: u64) {
799        // Mark that a control flow instruction has been inserted.
800        self.control_flow_instruction_inserted = true;
801
802        // ------------------------------------
803        // 1. If rs1 is immediate, we can do fast jumping
804        // ------------------------------------
805        let jump_target =
806            self.reg_values.get(&rs1).map(|rs1_imm| rs1_imm.wrapping_add(imm) & !1_u64);
807
808        // ------------------------------------
809        // 2. Update PC value
810        // ------------------------------------
811        self.emit_risc_operand_load(rs1.into(), TEMP_A);
812        dynasm! {
813            self;
814            .arch x64;
815
816            add Rq(TEMP_A), imm as i32;
817            and Rq(TEMP_A), -2;
818            mov QWORD [Rq(CONTEXT) + PC_OFFSET], Rq(TEMP_A)
819        }
820
821        // ------------------------------------
822        // 3. Compute & store next PC into rd.
823        // ------------------------------------
824        let next_pc = self.pc_current + 4;
825        self.emit_risc_register_store(TEMP_B, Some(next_pc), rd);
826
827        // Add the base amount of cycles for the instruction.
828        self.bump_clk();
829
830        self.end_branch(jump_target);
831    }
832
833    fn beq(&mut self, rs1: RiscRegister, rs2: RiscRegister, imm: u64) {
834        // Mark that a control flow instruction has been inserted.
835        self.control_flow_instruction_inserted = true;
836
837        // Add the base amount of cycles for the instruction.
838        self.bump_clk();
839
840        self.emit_risc_operand_load(rs1.into(), TEMP_A);
841        self.emit_risc_operand_load(rs2.into(), TEMP_B);
842
843        let branched_target = self.pc_current.wrapping_add(imm);
844        let not_branched_target = self.pc_current.wrapping_add(4);
845
846        // Compare the registers
847        dynasm! {
848            self;
849            .arch x64;
850
851            // Check if rs1 == rs2
852            cmp Rq(TEMP_A), Rq(TEMP_B);
853            // If rs1 != rs2, jump to not_branched, since that would imply !(rs1 == rs2)
854            jne >not_branched
855        }
856        // ------------------------------------
857        // Branched:
858        // 0. Bump the pc by the immediate.
859        // ------------------------------------
860        self.update_pc(Rq::RAX as u8, branched_target);
861        self.end_branch(Some(branched_target));
862
863        dynasm! {
864            self;
865            .arch x64;
866
867            // ------------------------------------
868            // Not branched:
869            // ------------------------------------
870            not_branched:
871        }
872        // ------------------------------------
873        // 1. Bump the pc by 4
874        // ------------------------------------
875        self.update_pc(Rq::RAX as u8, not_branched_target);
876        self.end_branch(Some(not_branched_target));
877    }
878
879    fn bge(&mut self, rs1: RiscRegister, rs2: RiscRegister, imm: u64) {
880        // Mark that a control flow instruction has been inserted.
881        self.control_flow_instruction_inserted = true;
882
883        // Add the base amount of cycles for the instruction.
884        self.bump_clk();
885
886        self.emit_risc_operand_load(rs1.into(), TEMP_A);
887        self.emit_risc_operand_load(rs2.into(), TEMP_B);
888
889        let branched_target = self.pc_current.wrapping_add(imm);
890        let not_branched_target = self.pc_current.wrapping_add(4);
891
892        dynasm! {
893            self;
894            .arch x64;
895
896            // Check if rs1 == rs2
897            cmp Rq(TEMP_A), Rq(TEMP_B);
898            // If rs1 < rs2, jump to not_branched, since that would imply !(rs1 >= rs2)
899            jl >not_branched
900        }
901        // ------------------------------------
902        // Branched:
903        // 0. Bump the pc by the immediate.
904        // ------------------------------------
905        self.update_pc(Rq::RAX as u8, branched_target);
906        self.end_branch(Some(branched_target));
907
908        dynasm! {
909            self;
910            .arch x64;
911
912            // ------------------------------------
913            // Not branched:
914            // ------------------------------------
915            not_branched:
916        }
917        // ------------------------------------
918        // 1. Bump the pc by 4
919        // ------------------------------------
920        self.update_pc(Rq::RAX as u8, not_branched_target);
921        self.end_branch(Some(not_branched_target));
922    }
923
924    fn bgeu(&mut self, rs1: RiscRegister, rs2: RiscRegister, imm: u64) {
925        // Mark that a control flow instruction has been inserted.
926        self.control_flow_instruction_inserted = true;
927
928        // Add the base amount of cycles for the instruction.
929        self.bump_clk();
930
931        self.emit_risc_operand_load(rs1.into(), TEMP_A);
932        self.emit_risc_operand_load(rs2.into(), TEMP_B);
933
934        let branched_target = self.pc_current.wrapping_add(imm);
935        let not_branched_target = self.pc_current.wrapping_add(4);
936
937        dynasm! {
938            self;
939            .arch x64;
940
941            cmp Rq(TEMP_A), Rq(TEMP_B);
942            // If rs1 < rs2, jump to not_branched, since that would imply !(rs1 >= rs2)
943            jb >not_branched
944        }
945        // ------------------------------------
946        // Branched:
947        // 0. Bump the pc by the immediate.
948        // ------------------------------------
949        self.update_pc(Rq::RAX as u8, branched_target);
950        self.end_branch(Some(branched_target));
951
952        dynasm! {
953            self;
954            .arch x64;
955
956            // ------------------------------------
957            // Not branched:
958            // ------------------------------------
959            not_branched:
960        }
961        // ------------------------------------
962        // 1. Bump the pc by 4
963        // ------------------------------------
964        self.update_pc(Rq::RAX as u8, not_branched_target);
965        self.end_branch(Some(not_branched_target));
966    }
967
968    fn blt(&mut self, rs1: RiscRegister, rs2: RiscRegister, imm: u64) {
969        // Mark that a control flow instruction has been inserted.
970        self.control_flow_instruction_inserted = true;
971
972        // Add the base amount of cycles for the instruction.
973        self.bump_clk();
974
975        self.emit_risc_operand_load(rs1.into(), TEMP_A);
976        self.emit_risc_operand_load(rs2.into(), TEMP_B);
977
978        let branched_target = self.pc_current.wrapping_add(imm);
979        let not_branched_target = self.pc_current.wrapping_add(4);
980
981        dynasm! {
982            self;
983            .arch x64;
984
985            // ------------------------------------
986            // Compare the two registers.
987            //
988            cmp Rq(TEMP_A), Rq(TEMP_B);   // signed compare
989            jge >not_branched             // rs1 ≥ rs2  →  skip
990        }
991        // ------------------------------------
992        // Branched:
993        // 0. Bump the pc by the immediate.
994        // ------------------------------------
995        self.update_pc(Rq::RAX as u8, branched_target);
996        self.end_branch(Some(branched_target));
997
998        dynasm! {
999            self;
1000            .arch x64;
1001
1002            // ------------------------------------
1003            // Not branched:
1004            // ------------------------------------
1005            not_branched:
1006        }
1007        // ------------------------------------
1008        // 1. Bump the pc by 4
1009        // ------------------------------------
1010        self.update_pc(Rq::RAX as u8, not_branched_target);
1011        self.end_branch(Some(not_branched_target));
1012    }
1013
1014    fn bltu(&mut self, rs1: RiscRegister, rs2: RiscRegister, imm: u64) {
1015        // Mark that a control flow instruction has been inserted.
1016        self.control_flow_instruction_inserted = true;
1017
1018        // Add the base amount of cycles for the instruction.
1019        self.bump_clk();
1020
1021        self.emit_risc_operand_load(rs1.into(), TEMP_A);
1022        self.emit_risc_operand_load(rs2.into(), TEMP_B);
1023
1024        let branched_target = self.pc_current.wrapping_add(imm);
1025        let not_branched_target = self.pc_current.wrapping_add(4);
1026
1027        dynasm! {
1028            self;
1029            .arch x64;
1030            cmp Rq(TEMP_A), Rq(TEMP_B);   // unsigned compare
1031            jae >not_branched             // rs1 ≥ rs2 (unsigned) → skip
1032        }
1033        // ------------------------------------
1034        // Branched:
1035        // 0. Bump the pc by the immediate.
1036        // ------------------------------------
1037        self.update_pc(Rq::RAX as u8, branched_target);
1038        self.end_branch(Some(branched_target));
1039
1040        dynasm! {
1041            self;
1042            .arch x64;
1043
1044            // ------------------------------------
1045            // Not branched:
1046            // ------------------------------------
1047            not_branched:
1048        }
1049        // ------------------------------------
1050        // 1. Bump the pc by 4
1051        // ------------------------------------
1052        self.update_pc(Rq::RAX as u8, not_branched_target);
1053        self.end_branch(Some(not_branched_target));
1054    }
1055
1056    fn bne(&mut self, rs1: RiscRegister, rs2: RiscRegister, imm: u64) {
1057        // Mark that a control flow instruction has been inserted.
1058        self.control_flow_instruction_inserted = true;
1059
1060        // Add the base amount of cycles for the instruction.
1061        self.bump_clk();
1062
1063        self.emit_risc_operand_load(rs1.into(), TEMP_A);
1064        self.emit_risc_operand_load(rs2.into(), TEMP_B);
1065
1066        let branched_target = self.pc_current.wrapping_add(imm);
1067        let not_branched_target = self.pc_current.wrapping_add(4);
1068
1069        dynasm! {
1070            self;
1071            .arch x64;
1072            cmp Rq(TEMP_A), Rq(TEMP_B);   // sets ZF
1073            je  >not_branched             // rs1 == rs2  →  skip
1074        }
1075        // ------------------------------------
1076        // Branched:
1077        // 0. Bump the pc by the immediate.
1078        // ------------------------------------
1079        self.update_pc(Rq::RAX as u8, branched_target);
1080        self.end_branch(Some(branched_target));
1081
1082        dynasm! {
1083            self;
1084            .arch x64;
1085
1086            // ------------------------------------
1087            // Not branched:
1088            // ------------------------------------
1089            not_branched:
1090        }
1091        // ------------------------------------
1092        // 1. Bump the pc by 4
1093        // ------------------------------------
1094        self.update_pc(Rq::RAX as u8, not_branched_target);
1095        self.end_branch(Some(not_branched_target));
1096    }
1097}
1098
1099impl MemoryInstructions for TranspilerBackend {
1100    fn lb(&mut self, rd: RiscRegister, rs1: RiscRegister, imm: u64) {
1101        self.may_early_exit = true;
1102
1103        // ------------------------------------
1104        // Load in the base address and the phy sical memory pointer.
1105        // ------------------------------------
1106        self.emit_risc_operand_load(rs1.into(), TEMP_A);
1107
1108        dynasm! {
1109            self;
1110            .arch x64;
1111
1112            // ------------------------------------
1113            // Add the immediate to the base address
1114            // Scaled to account for the entry size.
1115            //
1116            // TEMP_A = rs1 + imm = addr
1117            // ------------------------------------
1118            add Rq(TEMP_A), imm as i32;
1119
1120            // ------------------------------------
1121            // Store the intra-word offset.
1122            // ------------------------------------
1123            mov rax, Rq(TEMP_A);
1124            and rax, 7;
1125
1126            // ------------------------------------
1127            // Align to the start of the word.
1128            //
1129            // Scale to account for the entry size.
1130            // ------------------------------------
1131            and Rq(TEMP_A), -8;
1132            shl Rq(TEMP_A), 1;
1133
1134            // ------------------------------------
1135            // Add the risc32 byte offset to the physical memory pointer
1136            //
1137            // TEMP_A = addr + physical_memory_pointer
1138            // ------------------------------------
1139            add Rq(TEMP_A), Rq(MEMORY_PTR);
1140
1141            // ------------------------------------
1142            // 4. Load byte → sign-extend to 32 bits
1143            //
1144            // TEMP_B = clk
1145            // TEMP_A = addr + physical_memory_pointer
1146            // [addr + physical_memory_pointer] = clk
1147            // TEMP_A = [addr + physical_memory_pointer + 8]
1148            // ------------------------------------
1149            movsx Rq(TEMP_A), BYTE [Rq(TEMP_A) + 8 + rax]
1150        }
1151
1152        // 4. Write back to destination register
1153        self.emit_risc_register_store(TEMP_A, None, rd);
1154    }
1155
1156    fn lbu(&mut self, rd: RiscRegister, rs1: RiscRegister, imm: u64) {
1157        self.may_early_exit = true;
1158
1159        // ------------------------------------
1160        // Load in the base address
1161        // and the physical memory pointer.
1162        // ------------------------------------
1163        self.emit_risc_operand_load(rs1.into(), TEMP_A);
1164
1165        dynasm! {
1166            self;
1167            .arch x64;
1168
1169            // ------------------------------------
1170            // Add the immediate to the base address
1171            //
1172            // TEMP_A = rs1 + imm = addr
1173            // ------------------------------------
1174            add Rq(TEMP_A), imm as i32;
1175
1176            // ------------------------------------
1177            // Store the intra-word offset.
1178            // ------------------------------------
1179            mov rax, Rq(TEMP_A);
1180            and rax, 7;
1181
1182            // ------------------------------------
1183            // Align to the start of the word.
1184            //
1185            // Scale to account for the entry size.
1186            // ------------------------------------
1187            and Rq(TEMP_A), -8;
1188            shl Rq(TEMP_A), 1;
1189
1190            // ------------------------------------
1191            // Add the risc32 byte offset to the physical memory pointer
1192            //
1193            // TEMP_A = addr + physical_memory_pointer
1194            // ------------------------------------
1195            add Rq(TEMP_A), Rq(MEMORY_PTR);
1196
1197            // ------------------------------------
1198            // Load byte → zero-extend to 32 bits
1199            // ------------------------------------
1200            movzx Rq(TEMP_A), BYTE [Rq(TEMP_A) + 8 + rax]
1201        }
1202
1203        self.emit_risc_register_store(TEMP_A, None, rd);
1204    }
1205
1206    fn lh(&mut self, rd: RiscRegister, rs1: RiscRegister, imm: u64) {
1207        self.may_early_exit = true;
1208
1209        // ------------------------------------
1210        // Load in the base address
1211        // and the physical memory pointer.
1212        // ------------------------------------
1213        self.emit_risc_operand_load(rs1.into(), TEMP_A);
1214
1215        dynasm! {
1216            self;
1217            .arch x64;
1218
1219            // ------------------------------------
1220            // Add the immediate to the base address
1221            //
1222            // TEMP_A = rs1 + imm = addr
1223            // ------------------------------------
1224            add Rq(TEMP_A), imm as i32;
1225
1226             // ------------------------------------
1227            // Store the intra-word offset.
1228            // ------------------------------------
1229            mov rax, Rq(TEMP_A);
1230            and rax, 7;
1231
1232            // ------------------------------------
1233            // Align to the start of the word.
1234            //
1235            // Scale to account for the entry size.
1236            // ------------------------------------
1237            and Rq(TEMP_A), -8;
1238            shl Rq(TEMP_A), 1;
1239
1240            // ------------------------------------
1241            // Add the risc32 byte offset to the physical memory pointer
1242            //
1243            // TEMP_A = addr + physical_memory_pointer
1244            // ------------------------------------
1245            add Rq(TEMP_A), Rq(MEMORY_PTR);
1246
1247            // ------------------------------------
1248            // Load half-word → sign-extend to 32 bits
1249            // ------------------------------------
1250            movsx Rq(TEMP_A), WORD [Rq(TEMP_A) + 8 + rax]
1251        }
1252
1253        self.emit_risc_register_store(TEMP_A, None, rd);
1254    }
1255
1256    fn lhu(&mut self, rd: RiscRegister, rs1: RiscRegister, imm: u64) {
1257        self.may_early_exit = true;
1258
1259        // ------------------------------------
1260        //  Load in the base address
1261        //  and the physical memory pointer.
1262        // ------------------------------------
1263        self.emit_risc_operand_load(rs1.into(), TEMP_A);
1264
1265        dynasm! {
1266            self;
1267            .arch x64;
1268
1269            // ------------------------------------
1270            // Add the immediate to the base address
1271            //
1272            // TEMP_A = rs1 + imm = addr
1273            // ------------------------------------
1274            add Rq(TEMP_A), imm as i32;
1275
1276            // ------------------------------------
1277            // Store the intra-word offset.
1278            // ------------------------------------
1279            mov rax, Rq(TEMP_A);
1280            and rax, 7;
1281
1282            // ------------------------------------
1283            // Align to the start of the word.
1284            //
1285            // Scale to account for the entry size.
1286            // ------------------------------------
1287            and Rq(TEMP_A), -8;
1288            shl Rq(TEMP_A), 1;
1289
1290            // ------------------------------------
1291            // Add the risc32 byte offset to the physical memory pointer
1292            //
1293            // TEMP_A = addr + physical_memory_pointer
1294            // ------------------------------------
1295            add Rq(TEMP_A), Rq(MEMORY_PTR);
1296
1297            // ------------------------------------
1298            // Load 16 bits, zero-extend to 32 bits
1299            // ------------------------------------
1300            movzx Rq(TEMP_A), WORD [Rq(TEMP_A) + 8 + rax]
1301        }
1302
1303        self.emit_risc_register_store(TEMP_A, None, rd);
1304    }
1305
1306    fn lw(&mut self, rd: RiscRegister, rs1: RiscRegister, imm: u64) {
1307        self.may_early_exit = true;
1308
1309        // ------------------------------------
1310        // Load the base address into TEMP_A
1311        // and physical memory pointer into TEMP_B
1312        // ------------------------------------
1313        self.emit_risc_operand_load(rs1.into(), TEMP_A);
1314
1315        dynasm! {
1316            self;
1317            .arch x64;
1318
1319            // ------------------------------------
1320            // Add the immediate to the base address
1321            //
1322            // TEMP_A = rs1 + imm = addr
1323            // ------------------------------------
1324            add Rq(TEMP_A), imm as i32;
1325
1326            // ------------------------------------
1327            // Store the intra-word offset.
1328            // ------------------------------------
1329            mov rax, Rq(TEMP_A);
1330            and rax, 7;
1331
1332            // ------------------------------------
1333            // Align to the start of the word.
1334            //
1335            // Scale to account for the entry size.
1336            // ------------------------------------
1337            and Rq(TEMP_A), -8;
1338            shl Rq(TEMP_A), 1;
1339
1340            // ------------------------------------
1341            // 3. Add the risc32 byte offset to the physical memory pointer
1342            //
1343            // TEMP_A = addr + physical_memory_pointer
1344            // ------------------------------------
1345            add Rq(TEMP_A), Rq(MEMORY_PTR);
1346
1347            // ------------------------------------
1348            // 4. Load the word from physical memory into TEMP_A (sign-extended to 64-bit)
1349            // ------------------------------------
1350            movsxd Rq(TEMP_A), DWORD [Rq(TEMP_A) + 8 + rax]
1351        }
1352
1353        // ------------------------------------
1354        // 5. Store the result in the destination register.
1355        // ------------------------------------
1356        self.emit_risc_register_store(TEMP_A, None, rd);
1357    }
1358
1359    fn lwu(&mut self, rd: RiscRegister, rs1: RiscRegister, imm: u64) {
1360        self.may_early_exit = true;
1361
1362        // ------------------------------------
1363        // Load the base address into TEMP_A
1364        // and physical memory pointer into TEMP_B
1365        // ------------------------------------
1366        self.emit_risc_operand_load(rs1.into(), TEMP_A);
1367
1368        dynasm! {
1369            self;
1370            .arch x64;
1371
1372            // ------------------------------------
1373            // Add the immediate to the base address
1374            //
1375            // TEMP_A = rs1 + imm = addr
1376            // ------------------------------------
1377            add Rq(TEMP_A), imm as i32;
1378
1379            // ------------------------------------
1380            // Store the intra-word offset.
1381            // ------------------------------------
1382            mov rax, Rq(TEMP_A);
1383            and rax, 7;
1384
1385            // ------------------------------------
1386            // Align to the start of the word.
1387            //
1388            // Scale to account for the entry size.
1389            // ------------------------------------
1390            and Rq(TEMP_A), -8;
1391            shl Rq(TEMP_A), 1;
1392
1393            // ------------------------------------
1394            // 3. Add the risc32 byte offset to the physical memory pointer
1395            //
1396            // TEMP_A = addr + physical_memory_pointer
1397            // ------------------------------------
1398            add Rq(TEMP_A), Rq(MEMORY_PTR);
1399
1400            // ------------------------------------
1401            // 4. Load the word from physical memory into TEMP_B (zero-extended to 64-bit)
1402            // ------------------------------------
1403            mov Rd(TEMP_A), DWORD [Rq(TEMP_A) + 8 + rax]
1404        }
1405
1406        // ------------------------------------
1407        // 5. Store the result in the destination register.
1408        // ------------------------------------
1409        self.emit_risc_register_store(TEMP_A, None, rd);
1410    }
1411
1412    fn ld(&mut self, rd: RiscRegister, rs1: RiscRegister, imm: u64) {
1413        self.may_early_exit = true;
1414
1415        // ------------------------------------
1416        // 1. Load the base address into TEMP_A
1417        // and physical memory pointer into TEMP_B
1418        // ------------------------------------
1419        self.emit_risc_operand_load(rs1.into(), TEMP_A);
1420
1421        dynasm! {
1422            self;
1423            .arch x64;
1424
1425            // ------------------------------------
1426            //  Add the immediate to the base address
1427            //
1428            // TEMP_A = rs1 + imm = addr
1429            // ------------------------------------
1430            add Rq(TEMP_A), imm as i32;
1431
1432            // ------------------------------------
1433            // Scale to account for the entry size.
1434            //
1435            // Assume the addr is properly aligned.
1436            // ------------------------------------
1437            shl Rq(TEMP_A), 1;
1438
1439            // ------------------------------------
1440            // Add the risc byte offset to the physical memory pointer
1441            //
1442            // TEMP_A = addr + physical_memory_pointer
1443            // ------------------------------------
1444            add Rq(TEMP_A), Rq(MEMORY_PTR);
1445
1446            // ------------------------------------
1447            // Load the word from physical memory into TEMP_A
1448            // ------------------------------------
1449            mov Rq(TEMP_A), QWORD [Rq(TEMP_A) + 8]
1450        }
1451
1452        // ------------------------------------
1453        // Store the result in the destination register.
1454        // ------------------------------------
1455        self.emit_risc_register_store(TEMP_A, None, rd);
1456    }
1457
1458    fn sb(&mut self, rs1: RiscRegister, rs2: RiscRegister, imm: u64) {
1459        self.may_early_exit = true;
1460
1461        // ------------------------------------
1462        // Load the base address into TEMP_A
1463        // and physical memory pointer into TEMP_B
1464        // ------------------------------------
1465        self.emit_risc_operand_load(rs1.into(), TEMP_A);
1466
1467        dynasm! {
1468            self;
1469            .arch x64;
1470
1471            // ------------------------------------
1472            // Add the immediate to the base address
1473            // ------------------------------------
1474            add Rq(TEMP_A), imm as i32;
1475
1476            // ------------------------------------
1477            // Store the intra-word offset.
1478            // ------------------------------------
1479            mov rax, Rq(TEMP_A);
1480            and rax, 7;
1481
1482            // ------------------------------------
1483            // Align to the start of the word.
1484            //
1485            // Scale to account for the entry size.
1486            // ------------------------------------
1487            and Rq(TEMP_A), -8;
1488            shl Rq(TEMP_A), 1;
1489
1490            // ------------------------------------
1491            // Add the risc32 byte offset to the physical memory pointer
1492            // ------------------------------------
1493            add Rq(TEMP_A), Rq(MEMORY_PTR)
1494        }
1495
1496        // ------------------------------------
1497        // Load the word from the RISC register into TEMP_B
1498        // ------------------------------------
1499        self.emit_risc_operand_load(rs2.into(), TEMP_B);
1500
1501        // ------------------------------------
1502        // Store the word into physical memory
1503        // ------------------------------------
1504        dynasm! {
1505            self;
1506            .arch x64;
1507
1508            mov BYTE [Rq(TEMP_A) + 8 + rax], Rb(TEMP_B)
1509        }
1510    }
1511
1512    fn sh(&mut self, rs1: RiscRegister, rs2: RiscRegister, imm: u64) {
1513        self.may_early_exit = true;
1514
1515        // ------------------------------------
1516        // Load the base address into TEMP_A
1517        // and physical memory pointer into TEMP_B
1518        // ------------------------------------
1519        self.emit_risc_operand_load(rs1.into(), TEMP_A);
1520
1521        dynasm! {
1522            self;
1523            .arch x64;
1524
1525            // ------------------------------------
1526            // Add the immediate to the base address
1527            // ------------------------------------
1528            add Rq(TEMP_A), imm as i32;
1529
1530            // ------------------------------------
1531            // Store the intra-word offset.
1532            // ------------------------------------
1533            mov rax, Rq(TEMP_A);
1534            and rax, 7;
1535
1536            // ------------------------------------
1537            // Align to the start of the word.
1538            // Scale to account for the entry size.
1539            // ------------------------------------
1540            and Rq(TEMP_A), -8;
1541            shl Rq(TEMP_A), 1;
1542
1543            // ------------------------------------
1544            // Add the risc32 byte offset to the physical memory pointer
1545            // ------------------------------------
1546            add Rq(TEMP_A), Rq(MEMORY_PTR)
1547        }
1548
1549        // ------------------------------------
1550        // Load the word from the RISC register into TEMP_B
1551        // ------------------------------------
1552        self.emit_risc_operand_load(rs2.into(), TEMP_B);
1553
1554        // ------------------------------------
1555        // Store the word into physical memory
1556        // ------------------------------------
1557        dynasm! {
1558            self;
1559            .arch x64;
1560
1561            mov WORD [Rq(TEMP_A) + 8 + rax], Rw(TEMP_B)
1562        }
1563    }
1564
1565    fn sw(&mut self, rs1: RiscRegister, rs2: RiscRegister, imm: u64) {
1566        self.may_early_exit = true;
1567
1568        // ------------------------------------
1569        // Load the base address into TEMP_A
1570        // and physical memory pointer into TEMP_B
1571        // ------------------------------------
1572        self.emit_risc_operand_load(rs1.into(), TEMP_A);
1573
1574        dynasm! {
1575            self;
1576            .arch x64;
1577
1578            // ------------------------------------
1579            // Add the immediate to the base address
1580            // ------------------------------------
1581            add Rq(TEMP_A), imm as i32;
1582
1583            // ------------------------------------
1584            // Store the intra-word offset.
1585            // ------------------------------------
1586            mov rax, Rq(TEMP_A);
1587            and rax, 7;
1588
1589            // ------------------------------------
1590            // Align to the start of the word.
1591            // Scale to account for the entry size.
1592            // ------------------------------------
1593            and Rq(TEMP_A), -8;
1594            shl Rq(TEMP_A), 1;
1595
1596            // ------------------------------------
1597            // Add the risc32 byte offset to the physical memory pointer
1598            // ------------------------------------
1599            add Rq(TEMP_A), Rq(MEMORY_PTR)
1600        }
1601
1602        // ------------------------------------
1603        // Load the word from the RISC register into TEMP_B
1604        // ------------------------------------
1605        self.emit_risc_operand_load(rs2.into(), TEMP_B);
1606
1607        // ------------------------------------
1608        // Store the word into physical memory
1609        // ------------------------------------
1610        dynasm! {
1611            self;
1612            .arch x64;
1613
1614            mov DWORD [Rq(TEMP_A) + 8 + rax], Rd(TEMP_B)
1615        }
1616    }
1617
1618    fn sd(&mut self, rs1: RiscRegister, rs2: RiscRegister, imm: u64) {
1619        self.may_early_exit = true;
1620
1621        // ------------------------------------
1622        // Load the base address into TEMP_A
1623        // and physical memory pointer into TEMP_B
1624        // ------------------------------------
1625        self.emit_risc_operand_load(rs1.into(), TEMP_A);
1626
1627        dynasm! {
1628            self;
1629            .arch x64;
1630
1631            // ------------------------------------
1632            // Add the immediate to the base address
1633            // ------------------------------------
1634            add Rq(TEMP_A), imm as i32;
1635
1636            // ------------------------------------
1637            // Scale to account for the entry size.
1638            //
1639            // Assume the addr is properly aligned.
1640            // ------------------------------------
1641            shl Rq(TEMP_A), 1;
1642
1643            // ------------------------------------
1644            // 3. Add the risc32 byte offset to the physical memory pointer
1645            // ------------------------------------
1646            add Rq(TEMP_A), Rq(MEMORY_PTR)
1647        }
1648
1649        // ------------------------------------
1650        // Load the word from the RISC register into TEMP_B
1651        // ------------------------------------
1652        self.emit_risc_operand_load(rs2.into(), TEMP_B);
1653
1654        // ------------------------------------
1655        // Store the word into physical memory
1656        // ------------------------------------
1657        dynasm! {
1658            self;
1659            .arch x64;
1660
1661            mov QWORD [Rq(TEMP_A) + 8], Rq(TEMP_B)
1662        }
1663    }
1664}
1665
1666impl SystemInstructions for TranspilerBackend {
1667    fn ecall(&mut self) {
1668        // Mark that a control flow instruction has been inserted.
1669        self.control_flow_instruction_inserted = true;
1670        self.may_early_exit = true;
1671
1672        // Load the JitContext pointer into the argument register.
1673        dynasm! {
1674            self;
1675            .arch x64;
1676            mov rdi, Rq(CONTEXT)
1677        };
1678
1679        // `sp1_ecall_handler` bumps PC for syscalls. So we just need
1680        // to set current PC.
1681        self.update_pc(TEMP_A, self.pc_current);
1682
1683        self.call_extern_fn_raw(self.ecall_handler as _);
1684
1685        // The ecall returns a u64 in RAX.
1686        self.emit_risc_register_store(Rq::RAX as u8, None, RiscRegister::X5);
1687
1688        // Add the base amount of cycles for the instruction.
1689        self.bump_clk();
1690
1691        self.end_branch(None);
1692    }
1693
1694    fn unimp(&mut self) {
1695        extern "C" fn unimp(ctx: *mut JitContext) {
1696            let ctx = unsafe { &mut *ctx };
1697            eprintln!("Unimplemented instruction at pc: {}", ctx.pc);
1698            // Trap via SIGILL so the parent maps this to a typed `Unimplemented`
1699            // cause instead of a generic abort.
1700            unsafe { libc::raise(libc::SIGILL) };
1701        }
1702
1703        self.update_pc(TEMP_A, self.pc_current);
1704        self.call_extern_fn(unimp);
1705    }
1706}