Skip to main content

wasm_pvm/llvm_backend/
emitter.rs

1// Core PVM emitter and value management.
2//
3// Each SSA value gets a dedicated stack slot (correctness-first, no register allocation).
4// Pattern: load operands from slots → temp regs, compute, store result to slot.
5
6// We use 'as' casts extensively for:
7// - PVM register indices (u8) from iterators
8// - Address offsets (i32) from pointers
9// - Immediate values (i32/i64) from LLVM constants
10// These are intentional truncations/wraps where we know the range is safe or valid for PVM.
11#![allow(
12    clippy::cast_possible_truncation,
13    clippy::cast_possible_wrap,
14    clippy::cast_sign_loss
15)]
16
17use std::collections::{HashMap, HashSet};
18
19use inkwell::IntPredicate;
20use inkwell::basic_block::BasicBlock;
21use inkwell::values::{
22    AnyValue, AsValueRef, BasicValueEnum, FunctionValue, InstructionValue, IntValue, PhiValue,
23};
24
25use crate::pvm::Instruction;
26use crate::{Error, Result};
27
28use crate::abi::{FRAME_HEADER_SIZE, STACK_PTR_REG};
29use crate::translate::OptimizationFlags;
30
31use super::regalloc::RegAllocResult;
32
/// Context for lowering functions from a single WASM module.
pub struct LoweringContext {
    /// Base PVM address of WASM linear memory (WASM address 0 maps here).
    pub wasm_memory_base: i32,
    /// Number of WASM globals in the module.
    pub num_globals: usize,
    /// Base address for the parameter overflow area (5th+ args for `call_indirect`).
    pub param_overflow_base: i32,
    /// Per-function signature summary.
    /// NOTE(review): tuple presumably (param count, has return value) — confirm at use sites.
    pub function_signatures: Vec<(usize, bool)>,
    /// Per-type signature summary.
    /// NOTE(review): tuple presumably (param count, result count) — confirm at use sites.
    pub type_signatures: Vec<(usize, usize)>,
    /// WASM function table contents (function indices, used by `call_indirect`).
    pub function_table: Vec<u32>,
    /// Number of imported functions (these precede locally-defined functions).
    pub num_imported_funcs: usize,
    /// Names of the imported functions, in import order.
    pub imported_func_names: Vec<String>,
    /// Initial WASM linear memory size, in 64 KiB pages.
    pub initial_memory_pages: u32,
    /// Maximum WASM linear memory size, in 64 KiB pages.
    pub max_memory_pages: u32,
    /// Size of the PVM stack region, in bytes.
    pub stack_size: u32,
    /// Map from data segment index to offset in `RO_DATA` (for passive segments).
    pub data_segment_offsets: HashMap<u32, u32>,
    /// Map from data segment index to byte length (for passive segments bounds checking).
    pub data_segment_lengths: HashMap<u32, u32>,
    /// Map from data segment index to PVM address storing the effective length at runtime.
    /// Used by `memory.init` for bounds checking and by `data.drop` to zero the length.
    pub data_segment_length_addrs: HashMap<u32, i32>,
    /// User-provided mapping from WASM import names to actions (trap, nop).
    pub wasm_import_map: Option<HashMap<String, crate::translate::ImportAction>>,
    /// Optimization flags controlling which compiler passes are enabled.
    pub optimizations: OptimizationFlags,
}
59
/// Result of lowering one LLVM function to PVM instructions.
pub struct LlvmFunctionTranslation {
    /// The emitted PVM instruction stream for the function body.
    pub instructions: Vec<Instruction>,
    /// Direct-call sites to patch once final function addresses are known.
    pub call_fixups: Vec<LlvmCallFixup>,
    /// Indirect-call sites to patch once final layout is known.
    pub indirect_call_fixups: Vec<LlvmIndirectCallFixup>,
    /// Number of call return addresses allocated by this function (for jump table indexing).
    pub num_call_returns: usize,
    /// Stats collected during lowering (for compiler output).
    pub lowering_stats: FunctionLoweringStats,
}
70
/// Stats collected during function lowering for the compile stats output.
#[derive(Debug, Clone, Default)]
pub struct FunctionLoweringStats {
    /// Total stack frame size of the function, in bytes.
    pub frame_size: i32,
    /// True if the function makes no calls (leaf function).
    pub is_leaf: bool,
    /// Instruction count before dead store elimination.
    pub pre_dse_instructions: usize,
    /// Instruction count after DSE but before peephole.
    pub pre_peephole_instructions: usize,
    // ── Regalloc stats ──
    /// Total SSA values considered by the register allocator.
    pub regalloc_total_values: usize,
    /// SSA values that actually received a register.
    pub regalloc_allocated_values: usize,
    /// Physical register numbers allocated (e.g. [9, 10]).
    pub regalloc_registers_used: Vec<u8>,
    /// Why register allocation was skipped for this function, if it was.
    pub regalloc_skipped_reason: Option<&'static str>,
    // ── Regalloc usage stats (mirrors `RegAllocUsageStats`) ──
    /// `load_operand` found the value in an allocated register.
    pub regalloc_load_hits: usize,
    /// Allocated register was stale and had to be reloaded from the stack.
    pub regalloc_load_reloads: usize,
    /// `MoveReg` emitted from allocated register to a temp register.
    pub regalloc_load_moves: usize,
    /// `store_to_slot` wrote into an allocated destination register.
    pub regalloc_store_hits: usize,
    /// `MoveReg` emitted to copy a source into the allocated register.
    pub regalloc_store_moves: usize,
}
93
/// Patch record for a direct call site.
/// NOTE(review): indices appear to be into the function's instruction list — confirm at resolution site.
#[derive(Debug, Clone)]
pub struct LlvmCallFixup {
    /// Index of the instruction that materializes the call's return address.
    pub return_addr_instr: usize,
    /// Index of the jump instruction whose offset targets the callee.
    pub jump_instr: usize,
    /// Index of the target (callee) function.
    pub target_func: u32,
}
100
/// Patch record for an indirect (`call_indirect`) call site.
#[derive(Debug, Clone)]
pub struct LlvmIndirectCallFixup {
    /// Index of the instruction that materializes the call's return address.
    pub return_addr_instr: usize,
    /// Instruction index whose end offset is used as the call return PC.
    /// For `LoadImmJumpInd`, this is the same index as `return_addr_instr`.
    pub jump_ind_instr: usize,
}
108
/// Per-function configuration for the PVM emitter.
///
/// These fields are set once when lowering begins and are never mutated during codegen.
/// Separating them from mutable state makes it clear what's fixed vs what changes.
/// NOTE(review): the boolean flags presumably mirror entries of
/// `translate::OptimizationFlags` — confirm at construction site.
#[allow(clippy::struct_excessive_bools)]
pub struct EmitterConfig {
    /// WASM memory base address (for converting WASM addresses to PVM addresses).
    pub wasm_memory_base: i32,

    /// Base address for the parameter overflow area (5th+ args for `call_indirect`).
    pub param_overflow_base: i32,

    /// Whether the register cache (store-load forwarding) is enabled.
    pub register_cache_enabled: bool,

    /// Whether constant propagation (redundant `LoadImm` elimination) is enabled.
    pub constant_propagation_enabled: bool,

    /// Whether ICmp+Branch fusion is enabled.
    pub icmp_fusion_enabled: bool,

    /// Whether callee-save shrink wrapping is enabled.
    pub shrink_wrap_enabled: bool,

    /// Whether cross-block register cache propagation is enabled.
    pub cross_block_cache_enabled: bool,

    /// Whether register allocation (long-lived values in allocatable regs) is enabled.
    pub register_allocation_enabled: bool,

    /// Whether fallthrough jump elimination is enabled.
    pub fallthrough_jumps_enabled: bool,

    /// Whether lazy spill is enabled (skip stack stores for register-allocated values).
    pub lazy_spill_enabled: bool,
}
145
/// PVM code emitter for a single function.
pub struct PvmEmitter<'ctx> {
    /// Per-function configuration (immutable after construction).
    pub(crate) config: EmitterConfig,

    /// Instructions emitted so far, in program order.
    pub(crate) instructions: Vec<Instruction>,
    /// Label id → byte offset, `None` until the label is defined.
    pub(crate) labels: Vec<Option<usize>>,
    /// (instruction index, label id) pairs whose branch offsets are patched
    /// once all labels are defined.
    pub(crate) fixups: Vec<(usize, usize)>,

    /// LLVM basic block → PVM label.
    pub(crate) block_labels: HashMap<BasicBlock<'ctx>, usize>,

    /// LLVM int values (params + instruction results) → stack slot offset from SP.
    pub(crate) value_slots: HashMap<ValKey, i32>,

    /// Next available slot offset (bump allocator, starts after frame header).
    pub(crate) next_slot_offset: i32,

    /// Total frame size (set after pre-scan).
    pub(crate) frame_size: i32,

    /// Direct call sites awaiting target resolution.
    pub(crate) call_fixups: Vec<LlvmCallFixup>,
    /// Indirect call sites awaiting resolution.
    pub(crate) indirect_call_fixups: Vec<LlvmIndirectCallFixup>,

    /// Current byte offset of the emitted code (for O(1) offset calculations).
    pub(crate) byte_offset: usize,

    /// Maps stack slot offset → register that currently holds this slot's value.
    slot_cache: HashMap<i32, u8>,
    /// Reverse: register → slot offset it holds (for fast invalidation).
    reg_to_slot: [Option<i32>; 13],

    /// Pending fused `ICmp`: when an `ICmp` has a single use (by a branch), we defer
    /// it and store the predicate + operands here. The branch will emit a fused
    /// branch instruction instead of loading the boolean result.
    pub(crate) pending_fused_icmp: Option<FusedIcmp<'ctx>>,

    /// Maps register → known constant value currently held (for constant propagation).
    /// When a `LoadImm`/`LoadImm64` is about to be emitted, we check if the target
    /// register already holds the same constant and skip the load if so.
    reg_to_const: [Option<u64>; 13],

    /// Next jump table index for call return addresses.
    /// Incremented each time a call is emitted, starting from `call_return_base_idx`.
    pub(crate) next_call_return_idx: usize,

    /// Base index for call return addresses (set from the global counter).
    call_return_base_idx: usize,

    /// For each block with exactly one predecessor, maps block → its single predecessor.
    /// Used for cross-block register cache propagation.
    pub(crate) block_single_pred: HashMap<BasicBlock<'ctx>, BasicBlock<'ctx>>,

    /// Which callee-saved registers (r9-r12) are actually used by this function.
    /// Index 0 = r9, 1 = r10, 2 = r11, 3 = r12.
    pub(crate) used_callee_regs: [bool; 4],

    /// Frame offset for each callee-saved register (r9-r12), if saved.
    /// Index 0 = r9, 1 = r10, 2 = r11, 3 = r12.
    pub(crate) callee_save_offsets: [Option<i32>; 4],

    /// Whether the function contains any call instructions.
    /// If false (leaf function), we can skip saving/restoring the return address register.
    pub(crate) has_calls: bool,

    /// Register allocation results (empty if disabled).
    pub(crate) regalloc: RegAllocResult,

    /// Which stack slot is currently materialized in each allocated register.
    /// Multiple slots may map to the same physical register across disjoint
    /// intervals; this tracks runtime ownership so stale values are reloaded.
    alloc_reg_slot: [Option<i32>; 13],

    /// Dirty bit per register: true when the register holds a value not yet
    /// written to its stack slot (lazy spill). Only meaningful when
    /// `config.lazy_spill_enabled` is true.
    alloc_dirty: [bool; 13],

    /// Runtime usage counters for register-allocated mappings.
    pub(crate) regalloc_usage: RegAllocUsageStats,

    /// Label of the next basic block in layout order.
    /// When set, `emit_jump_to_label()` skips the Jump if the target matches,
    /// letting execution fall through to the next block naturally.
    pub next_block_label: Option<usize>,
}
232
/// Snapshot of the register cache state for cross-block propagation.
#[derive(Clone)]
pub struct CacheSnapshot {
    /// Stack slot offset → register that holds the slot's value.
    pub slot_cache: HashMap<i32, u8>,
    /// Register → slot offset it holds (reverse of `slot_cache`).
    pub reg_to_slot: [Option<i32>; 13],
    /// Register → known constant value it currently holds.
    pub reg_to_const: [Option<u64>; 13],
    /// Allocated register → slot it currently materializes.
    pub alloc_reg_slot: [Option<i32>; 13],
    /// Per-register dirty bit for lazy spill.
    pub alloc_dirty: [bool; 13],
}
242
/// Instrumentation counters describing how much allocated mappings are used by codegen.
///
/// Counters start at zero for each `PvmEmitter` (see `PvmEmitter::new`) and
/// accumulate over the lifetime of one function's lowering.
#[derive(Debug, Clone, Default)]
pub struct RegAllocUsageStats {
    /// `load_operand` looked up a value in `val_to_reg`.
    pub load_hits: usize,
    /// The allocated register had been invalidated and had to be reloaded from stack.
    pub load_reloads: usize,
    /// `load_operand` emitted `MoveReg` from allocated register to target temp register.
    pub load_moves: usize,
    /// `store_to_slot` wrote a value into an allocated destination register.
    pub store_hits: usize,
    /// `store_to_slot` needed a `MoveReg` to copy source into the allocated register.
    pub store_moves: usize,
}
257
258impl CacheSnapshot {
259    /// Invalidate a register's cache entries in this snapshot.
260    /// Used to remove entries for registers that a terminator may clobber.
261    pub fn invalidate_reg(&mut self, reg: u8) {
262        let idx = reg as usize;
263        if let Some(slot) = self.reg_to_slot[idx].take() {
264            self.slot_cache.remove(&slot);
265        }
266        self.reg_to_const[idx] = None;
267        self.invalidate_alloc_reg(reg);
268    }
269
270    /// Invalidate allocated-register slot ownership in this snapshot.
271    pub fn invalidate_alloc_reg(&mut self, reg: u8) {
272        let idx = reg as usize;
273        self.alloc_reg_slot[idx] = None;
274        self.alloc_dirty[idx] = false;
275    }
276}
277
/// Deferred `ICmp` info for branch fusion.
pub struct FusedIcmp<'ctx> {
    /// Comparison predicate of the deferred `ICmp`.
    pub predicate: IntPredicate,
    /// Left-hand operand of the comparison.
    pub lhs: BasicValueEnum<'ctx>,
    /// Right-hand operand of the comparison.
    pub rhs: BasicValueEnum<'ctx>,
}
284
/// Wrapper key for LLVM values in the slot map.
/// Uses the raw LLVM value pointer cast to usize for hashing.
/// NOTE(review): keys are pointer-identity based, so they are only meaningful
/// while the originating LLVM context is alive — confirm lifetimes at call sites.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct ValKey(pub(crate) usize);
289
290pub fn val_key_int(val: IntValue<'_>) -> ValKey {
291    ValKey(val.as_value_ref() as usize)
292}
293
294pub fn val_key_basic(val: BasicValueEnum<'_>) -> ValKey {
295    ValKey(val.as_value_ref() as usize)
296}
297
298pub fn val_key_instr(val: InstructionValue<'_>) -> ValKey {
299    ValKey(val.as_value_ref() as usize)
300}
301
302impl<'ctx> PvmEmitter<'ctx> {
303    pub fn new(config: EmitterConfig, call_return_base: usize) -> Self {
304        Self {
305            config,
306            instructions: Vec::new(),
307            labels: Vec::new(),
308            fixups: Vec::new(),
309            block_labels: HashMap::new(),
310            value_slots: HashMap::new(),
311            next_slot_offset: FRAME_HEADER_SIZE,
312            frame_size: 0,
313            call_fixups: Vec::new(),
314            indirect_call_fixups: Vec::new(),
315            byte_offset: 0,
316            slot_cache: HashMap::new(),
317            reg_to_slot: [None; 13],
318            reg_to_const: [None; 13],
319            pending_fused_icmp: None,
320            block_single_pred: HashMap::new(),
321            next_call_return_idx: call_return_base,
322            call_return_base_idx: call_return_base,
323            used_callee_regs: [true; 4],
324            callee_save_offsets: [Some(8), Some(16), Some(24), Some(32)],
325            has_calls: true, // conservative default
326            regalloc: RegAllocResult::default(),
327            alloc_reg_slot: [None; 13],
328            alloc_dirty: [false; 13],
329            regalloc_usage: RegAllocUsageStats::default(),
330            next_block_label: None,
331        }
332    }
333
334    /// Allocate a call return address (jump table address) for a direct call site.
335    /// Returns the pre-computed jump table address `(index + 1) * 2`.
336    pub fn alloc_call_return_addr(&mut self) -> i32 {
337        let idx = self.next_call_return_idx;
338        self.next_call_return_idx += 1;
339        (idx + 1)
340            .checked_mul(2)
341            .and_then(|v| i32::try_from(v).ok())
342            .expect("alloc_call_return_addr: jump table index overflow (exceeds i32 range)")
343    }
344
    /// Returns how many call return addresses this function allocated.
    /// (Difference between the running counter and the base index set at construction.)
    pub fn num_call_returns(&self) -> usize {
        self.next_call_return_idx - self.call_return_base_idx
    }
349
350    pub fn alloc_label(&mut self) -> usize {
351        let id = self.labels.len();
352        self.labels.push(None);
353        id
354    }
355
356    pub fn define_label(&mut self, label: usize) {
357        if self
358            .instructions
359            .last()
360            .is_some_and(|last| !last.is_terminating())
361        {
362            self.emit(Instruction::Fallthrough);
363        }
364        self.labels[label] = Some(self.current_offset());
365        self.clear_reg_cache();
366    }
367
368    /// Define a label, clearing the general cache but preserving allocated register state.
369    ///
370    /// Used for leaf functions where allocated registers are never clobbered.
371    pub fn define_label_preserving_alloc(&mut self, label: usize) {
372        if self
373            .instructions
374            .last()
375            .is_some_and(|last| !last.is_terminating())
376        {
377            self.emit(Instruction::Fallthrough);
378        }
379        self.labels[label] = Some(self.current_offset());
380        self.clear_reg_cache_preserving_alloc();
381    }
382
    /// Current byte offset of the emitted code (O(1), tracked incrementally).
    pub fn current_offset(&self) -> usize {
        self.byte_offset
    }
386
387    pub fn emit(&mut self, instr: Instruction) {
388        // Constant propagation: skip LoadImm/LoadImm64 if register already holds the value.
389        if self.config.constant_propagation_enabled {
390            match &instr {
391                Instruction::LoadImm { reg, value } => {
392                    // LoadImm sign-extends i32 → i64, so the 64-bit value is the sign extension.
393                    let val64 = i64::from(*value) as u64;
394                    if self.reg_to_const[*reg as usize] == Some(val64) {
395                        return; // Already holds this constant.
396                    }
397                }
398                Instruction::LoadImm64 { reg, value } => {
399                    if self.reg_to_const[*reg as usize] == Some(*value) {
400                        return; // Already holds this constant.
401                    }
402                }
403                _ => {}
404            }
405        }
406
407        if let Some(reg) = instr.dest_reg() {
408            self.invalidate_reg(reg);
409        }
410
411        // Track constants after emit.
412        if self.config.constant_propagation_enabled {
413            match &instr {
414                Instruction::LoadImm { reg, value } => {
415                    self.reg_to_const[*reg as usize] = Some(i64::from(*value) as u64);
416                }
417                Instruction::LoadImm64 { reg, value } => {
418                    self.reg_to_const[*reg as usize] = Some(*value);
419                }
420                _ => {}
421            }
422        }
423
424        self.byte_offset += instr.encode().len();
425        self.instructions.push(instr);
426    }
427
428    pub fn emit_jump_to_label(&mut self, label: usize) {
429        // If jumping to the next block in layout order, skip the Jump — execution
430        // will fall through naturally. define_label() emits a Fallthrough if needed.
431        if self.config.fallthrough_jumps_enabled && self.next_block_label == Some(label) {
432            return;
433        }
434        let fixup_idx = self.instructions.len();
435        self.fixups.push((fixup_idx, label));
436        self.emit(Instruction::Jump { offset: 0 });
437    }
438
439    pub fn emit_branch_ne_imm_to_label(&mut self, reg: u8, value: i32, label: usize) {
440        let fixup_idx = self.instructions.len();
441        self.fixups.push((fixup_idx, label));
442        self.emit(Instruction::BranchNeImm {
443            reg,
444            value,
445            offset: 0,
446        });
447    }
448
449    pub fn emit_branch_eq_imm_to_label(&mut self, reg: u8, value: i32, label: usize) {
450        let fixup_idx = self.instructions.len();
451        self.fixups.push((fixup_idx, label));
452        self.emit(Instruction::BranchEqImm {
453            reg,
454            value,
455            offset: 0,
456        });
457    }
458
459    pub fn emit_branch_lt_u_to_label(&mut self, reg1: u8, reg2: u8, label: usize) {
460        let fixup_idx = self.instructions.len();
461        self.fixups.push((fixup_idx, label));
462        self.emit(Instruction::BranchLtU {
463            reg1,
464            reg2,
465            offset: 0,
466        });
467    }
468
469    pub fn emit_branch_ge_u_to_label(&mut self, reg1: u8, reg2: u8, label: usize) {
470        let fixup_idx = self.instructions.len();
471        self.fixups.push((fixup_idx, label));
472        self.emit(Instruction::BranchGeU {
473            reg1,
474            reg2,
475            offset: 0,
476        });
477    }
478
479    pub fn emit_branch_eq_to_label(&mut self, reg1: u8, reg2: u8, label: usize) {
480        let fixup_idx = self.instructions.len();
481        self.fixups.push((fixup_idx, label));
482        self.emit(Instruction::BranchEq {
483            reg1,
484            reg2,
485            offset: 0,
486        });
487    }
488
489    pub fn emit_branch_ne_to_label(&mut self, reg1: u8, reg2: u8, label: usize) {
490        let fixup_idx = self.instructions.len();
491        self.fixups.push((fixup_idx, label));
492        self.emit(Instruction::BranchNe {
493            reg1,
494            reg2,
495            offset: 0,
496        });
497    }
498
499    pub fn emit_branch_lt_s_to_label(&mut self, reg1: u8, reg2: u8, label: usize) {
500        let fixup_idx = self.instructions.len();
501        self.fixups.push((fixup_idx, label));
502        self.emit(Instruction::BranchLtS {
503            reg1,
504            reg2,
505            offset: 0,
506        });
507    }
508
509    pub fn emit_branch_ge_s_to_label(&mut self, reg1: u8, reg2: u8, label: usize) {
510        let fixup_idx = self.instructions.len();
511        self.fixups.push((fixup_idx, label));
512        self.emit(Instruction::BranchGeS {
513            reg1,
514            reg2,
515            offset: 0,
516        });
517    }
518
519    pub fn emit_branch_lt_u_imm_to_label(&mut self, reg: u8, value: i32, label: usize) {
520        let fixup_idx = self.instructions.len();
521        self.fixups.push((fixup_idx, label));
522        self.emit(Instruction::BranchLtUImm {
523            reg,
524            value,
525            offset: 0,
526        });
527    }
528
529    pub fn emit_branch_le_u_imm_to_label(&mut self, reg: u8, value: i32, label: usize) {
530        let fixup_idx = self.instructions.len();
531        self.fixups.push((fixup_idx, label));
532        self.emit(Instruction::BranchLeUImm {
533            reg,
534            value,
535            offset: 0,
536        });
537    }
538
539    pub fn emit_branch_ge_u_imm_to_label(&mut self, reg: u8, value: i32, label: usize) {
540        let fixup_idx = self.instructions.len();
541        self.fixups.push((fixup_idx, label));
542        self.emit(Instruction::BranchGeUImm {
543            reg,
544            value,
545            offset: 0,
546        });
547    }
548
549    pub fn emit_branch_gt_u_imm_to_label(&mut self, reg: u8, value: i32, label: usize) {
550        let fixup_idx = self.instructions.len();
551        self.fixups.push((fixup_idx, label));
552        self.emit(Instruction::BranchGtUImm {
553            reg,
554            value,
555            offset: 0,
556        });
557    }
558
559    pub fn emit_branch_lt_s_imm_to_label(&mut self, reg: u8, value: i32, label: usize) {
560        let fixup_idx = self.instructions.len();
561        self.fixups.push((fixup_idx, label));
562        self.emit(Instruction::BranchLtSImm {
563            reg,
564            value,
565            offset: 0,
566        });
567    }
568
569    pub fn emit_branch_le_s_imm_to_label(&mut self, reg: u8, value: i32, label: usize) {
570        let fixup_idx = self.instructions.len();
571        self.fixups.push((fixup_idx, label));
572        self.emit(Instruction::BranchLeSImm {
573            reg,
574            value,
575            offset: 0,
576        });
577    }
578
579    pub fn emit_branch_ge_s_imm_to_label(&mut self, reg: u8, value: i32, label: usize) {
580        let fixup_idx = self.instructions.len();
581        self.fixups.push((fixup_idx, label));
582        self.emit(Instruction::BranchGeSImm {
583            reg,
584            value,
585            offset: 0,
586        });
587    }
588
589    pub fn emit_branch_gt_s_imm_to_label(&mut self, reg: u8, value: i32, label: usize) {
590        let fixup_idx = self.instructions.len();
591        self.fixups.push((fixup_idx, label));
592        self.emit(Instruction::BranchGtSImm {
593            reg,
594            value,
595            offset: 0,
596        });
597    }
598
599    // ── Slot allocation ──
600
601    pub fn alloc_slot_for_key(&mut self, key: ValKey) -> i32 {
602        let offset = self.next_slot_offset;
603        self.value_slots.insert(key, offset);
604        self.next_slot_offset += 8;
605        offset
606    }
607
    /// Look up the stack slot offset previously allocated for `key`, if any.
    pub fn get_slot(&self, key: ValKey) -> Option<i32> {
        self.value_slots.get(&key).copied()
    }
611
612    // ── Value load / store ──
613
614    /// Emit a constant load into `reg` (prefer compact `LoadImm` when the value fits i32).
615    /// Constant propagation in `emit()` will skip if the register already holds this value.
616    fn emit_const_to_reg(&mut self, reg: u8, value: u64) {
617        let signed = value as i64;
618        if let Ok(v32) = i32::try_from(signed) {
619            self.emit(Instruction::LoadImm { reg, value: v32 });
620        } else {
621            self.emit(Instruction::LoadImm64 { reg, value });
622        }
623    }
624
    /// Load a value into a temp register. Constants are inlined; SSA values are loaded from slots.
    /// Poison/undef values are materialized as 0 (any value is valid for undefined behavior).
    /// Returns an error for truly unknown values.
    ///
    /// # Errors
    ///
    /// Returns `Error::Internal` for non-integer values, or for integer SSA
    /// values that were never assigned a stack slot.
    pub fn load_operand(&mut self, val: BasicValueEnum<'ctx>, temp_reg: u8) -> Result<()> {
        match val {
            BasicValueEnum::IntValue(iv) => {
                let key = val_key_int(iv);
                if let Some(signed_val) = iv.get_sign_extended_constant() {
                    self.emit_const_to_reg(temp_reg, signed_val as u64);
                } else if let Some(const_val) = iv.get_zero_extended_constant() {
                    // Fallback for unsigned constants.
                    self.emit_const_to_reg(temp_reg, const_val);
                } else if let Some(&alloc_reg) = self.regalloc.val_to_reg.get(&key) {
                    self.regalloc_usage.load_hits += 1;
                    let slot = self.get_slot(key).ok_or_else(|| {
                        Error::Internal(format!("no slot for allocated int value {key:?}"))
                    })?;
                    // Value has an allocated register. If it was clobbered OR this
                    // register currently materializes a different slot, reload
                    // lazily from the canonical stack slot.
                    let reg_idx = alloc_reg as usize;
                    if self.alloc_reg_slot[reg_idx] == Some(slot) {
                        // Fast path: register holds the correct value.
                        if alloc_reg != temp_reg {
                            self.regalloc_usage.load_moves += 1;
                            self.emit(Instruction::MoveReg {
                                dst: temp_reg,
                                src: alloc_reg,
                            });
                        }
                        // If alloc_reg == temp_reg, the value is already in place.
                    } else if alloc_reg == temp_reg {
                        // Reload directly into the target (which is the alloc reg).
                        self.regalloc_usage.load_reloads += 1;
                        self.emit(Instruction::LoadIndU64 {
                            dst: alloc_reg,
                            base: STACK_PTR_REG,
                            offset: slot,
                        });
                        self.alloc_reg_slot[reg_idx] = Some(slot);
                        self.cache_slot(slot, alloc_reg);
                    } else {
                        // Allocated register was invalidated and target is different.
                        // Load directly into temp_reg to avoid clobbering alloc_reg
                        // (which may hold a call argument or other transient value).
                        self.regalloc_usage.load_reloads += 1;
                        self.emit(Instruction::LoadIndU64 {
                            dst: temp_reg,
                            base: STACK_PTR_REG,
                            offset: slot,
                        });
                        self.cache_slot(slot, temp_reg);
                    }
                } else if let Some(slot) = self.get_slot(key) {
                    // Check register cache: skip load if value is already in a register.
                    if let Some(&cached_reg) = self.slot_cache.get(&slot) {
                        if cached_reg != temp_reg {
                            // Emit a register copy (cheaper than memory load).
                            self.emit(Instruction::MoveReg {
                                dst: temp_reg,
                                src: cached_reg,
                            });
                        }
                        // If cached_reg == temp_reg, skip entirely (0 instructions).
                    } else {
                        self.emit(Instruction::LoadIndU64 {
                            dst: temp_reg,
                            base: STACK_PTR_REG,
                            offset: slot,
                        });
                        self.cache_slot(slot, temp_reg);
                    }
                } else if iv.is_poison() || iv.is_undef() {
                    // Poison/undef values can be materialized as any value; use 0.
                    self.emit(Instruction::LoadImm {
                        reg: temp_reg,
                        value: 0,
                    });
                } else {
                    return Err(Error::Internal(format!(
                        "no slot for int value {:?} (opcode: {:?})",
                        val_key_int(iv),
                        iv.as_instruction().map(InstructionValue::get_opcode),
                    )));
                }
            }
            _ => {
                return Err(Error::Internal(format!(
                    "cannot load non-integer value type {:?}",
                    val.get_type()
                )));
            }
        }
        Ok(())
    }
719
720    pub fn store_to_slot(&mut self, slot_offset: i32, src_reg: u8) {
721        let alloc_reg = self.regalloc.slot_to_reg.get(&slot_offset).copied();
722        if let Some(alloc_reg) = alloc_reg {
723            // If the register currently holds a DIFFERENT slot's dirty value,
724            // spill it first. This happens when multiple values share a register
725            // via early interval expiration (loop phi register reuse).
726            let reg_idx = alloc_reg as usize;
727            if let Some(current_slot) = self.alloc_reg_slot[reg_idx]
728                && current_slot != slot_offset
729                && self.alloc_dirty[reg_idx]
730            {
731                self.spill_dirty_reg(alloc_reg);
732            }
733
734            self.regalloc_usage.store_hits += 1;
735            if src_reg != alloc_reg {
736                self.regalloc_usage.store_moves += 1;
737                self.emit(Instruction::MoveReg {
738                    dst: alloc_reg,
739                    src: src_reg,
740                });
741            }
742            self.alloc_reg_slot[reg_idx] = Some(slot_offset);
743
744            if self.config.lazy_spill_enabled {
745                // Lazy spill: mark dirty, skip the stack store.
746                self.alloc_dirty[reg_idx] = true;
747                self.cache_slot(slot_offset, alloc_reg);
748                return;
749            }
750        }
751        self.emit(Instruction::StoreIndU64 {
752            base: STACK_PTR_REG,
753            src: src_reg,
754            offset: slot_offset,
755        });
756        self.cache_slot(slot_offset, alloc_reg.unwrap_or(src_reg));
757    }
758
759    // ── Register allocation spill/reload ──
760
761    /// Flush a single dirty allocated register to its stack slot.
762    ///
763    /// Emits `StoreIndU64` directly (bypassing `emit()` to avoid `invalidate_reg`
764    /// clearing `alloc_reg_slot`), then marks the register clean.
765    fn spill_dirty_reg(&mut self, reg: u8) {
766        let reg_idx = reg as usize;
767        if !self.alloc_dirty[reg_idx] {
768            return;
769        }
770        if let Some(slot) = self.alloc_reg_slot[reg_idx] {
771            let instr = Instruction::StoreIndU64 {
772                base: STACK_PTR_REG,
773                src: reg,
774                offset: slot,
775            };
776            self.byte_offset += instr.encode().len();
777            self.instructions.push(instr);
778        }
779        self.alloc_dirty[reg_idx] = false;
780    }
781
782    /// Flush ALL dirty allocated registers to their stack slots.
783    pub fn spill_all_dirty_regs(&mut self) {
784        let regs: Vec<u8> = self.regalloc.reg_to_slot.keys().copied().collect();
785        for reg in regs {
786            self.spill_dirty_reg(reg);
787        }
788    }
789
790    /// Set `alloc_reg_slot` for a specific register to a given slot and mark dirty.
791    ///
792    /// Used after register-aware phi copies to tell the emitter that the
793    /// allocated register now holds the phi destination value.
794    /// Must be called AFTER `emit(MoveReg)` since `emit()` triggers `invalidate_reg()`.
795    pub fn set_alloc_reg_for_slot(&mut self, reg: u8, slot: i32) {
796        let idx = reg as usize;
797        self.alloc_reg_slot[idx] = Some(slot);
798        self.alloc_dirty[idx] = true;
799        self.cache_slot(slot, reg);
800    }
801
802    /// Check if a register currently owns its expected allocated slot.
803    pub fn is_alloc_reg_valid(&self, reg: u8, slot: i32) -> bool {
804        self.alloc_reg_slot[reg as usize] == Some(slot)
805    }
806
807    /// Emit a raw `MoveReg` bypassing `emit()` (no `invalidate_reg` on dst).
808    /// Used for register-aware phi copies where we manage alloc state manually.
809    pub fn emit_raw_move(&mut self, dst: u8, src: u8) {
810        let instr = Instruction::MoveReg { dst, src };
811        self.byte_offset += instr.encode().len();
812        self.instructions.push(instr);
813        // Clear stale constant cache for dst — the register no longer holds
814        // a known constant after the move.
815        self.reg_to_const[dst as usize] = None;
816    }
817
818    /// Public wrapper for spilling a single dirty register.
819    /// Used by the parallel move resolver in phi copies.
820    pub fn spill_dirty_reg_pub(&mut self, reg: u8) {
821        self.spill_dirty_reg(reg);
822    }
823
824    /// Spill register-allocated values before instructions that clobber them.
825    /// With lazy spill, flushes dirty registers to stack.
826    /// Without lazy spill (write-through), this is a no-op.
827    pub fn spill_allocated_regs(&mut self) {
828        if self.config.lazy_spill_enabled {
829            self.spill_all_dirty_regs();
830        }
831    }
832
833    fn invalidate_allocated_regs_where(&mut self, mut pred: impl FnMut(u8) -> bool) {
834        let regs: Vec<u8> = self
835            .regalloc
836            .reg_to_slot
837            .keys()
838            .copied()
839            .filter(|&r| pred(r))
840            .collect();
841        for reg in regs {
842            self.invalidate_reg(reg);
843        }
844    }
845
846    /// Invalidate allocated registers clobbered by scratch-heavy helpers.
847    /// This is used after intrinsics that overwrite r5/r6.
848    pub fn reload_allocated_regs_after_scratch_clobber(&mut self) {
849        self.invalidate_allocated_regs_where(|r| {
850            r == crate::abi::SCRATCH1 || r == crate::abi::SCRATCH2
851        });
852    }
853
854    /// Invalidate allocated registers clobbered by a function call.
855    ///
856    /// Call argument setup may overwrite local argument registers (r9+), so
857    /// allocated local-register mappings are invalidated after each call.
858    /// This blanket variant invalidates all r9-r12; prefer
859    /// `reload_allocated_regs_after_call_with_arity` when the call's argument
860    /// count is known.
861    pub fn reload_allocated_regs_after_call(&mut self) {
862        self.reload_allocated_regs_after_call_with_arity(crate::abi::MAX_LOCAL_REGS);
863    }
864
865    /// Invalidate only the allocated registers actually clobbered by a call
866    /// with `num_args` arguments. Registers r9..r9+min(num_args,4)-1 are
867    /// clobbered by argument setup; higher registers remain valid.
868    /// r5/r6 (scratch) and r7/r8 (caller-saved) are always clobbered by calls.
869    /// Note: `clear_reg_cache()` already clears all alloc state before this is
870    /// called, so this is largely a documentation/safety net.
871    pub fn reload_allocated_regs_after_call_with_arity(&mut self, num_args: usize) {
872        let clobbered_locals = num_args.min(crate::abi::MAX_LOCAL_REGS);
873        self.invalidate_allocated_regs_where(|r| {
874            r == crate::abi::SCRATCH1
875                || r == crate::abi::SCRATCH2
876                || r == crate::abi::RETURN_VALUE_REG
877                || r == crate::abi::ARGS_LEN_REG
878                || (r >= crate::abi::FIRST_LOCAL_REG
879                    && r < crate::abi::FIRST_LOCAL_REG + clobbered_locals as u8)
880        });
881    }
882
883    // ── Register cache ──
884
885    /// Record that `reg` now holds the value of `slot`.
886    pub fn cache_slot(&mut self, slot: i32, reg: u8) {
887        if !self.config.register_cache_enabled {
888            return;
889        }
890        // Remove any previous slot cached in this register.
891        if let Some(old_slot) = self.reg_to_slot[reg as usize].take() {
892            self.slot_cache.remove(&old_slot);
893        }
894        // Remove any previous register caching this slot.
895        if let Some(old_reg) = self.slot_cache.insert(slot, reg) {
896            self.reg_to_slot[old_reg as usize] = None;
897        }
898        self.reg_to_slot[reg as usize] = Some(slot);
899    }
900
    /// Invalidate a register's cache entry (called when the register is overwritten).
    /// With lazy spill, if the register holds a dirty value, flush it to the
    /// stack before losing it.
    fn invalidate_reg(&mut self, reg: u8) {
        let idx = reg as usize;
        // Drop the general-cache mapping in both directions (reg→slot and slot→reg).
        if let Some(slot) = self.reg_to_slot[idx].take() {
            self.slot_cache.remove(&slot);
        }
        // Only registers managed by the allocator carry alloc_* state.
        if self.regalloc.reg_to_slot.contains_key(&reg) {
            // Lazy spill: if the register is dirty, spill to stack before clearing.
            // The store is pushed directly rather than via `emit()` so this
            // invalidation does not re-enter itself.
            if self.config.lazy_spill_enabled
                && self.alloc_dirty[idx]
                && let Some(slot) = self.alloc_reg_slot[idx]
            {
                let instr = Instruction::StoreIndU64 {
                    base: STACK_PTR_REG,
                    src: reg,
                    offset: slot,
                };
                self.byte_offset += instr.encode().len();
                self.instructions.push(instr);
            }
            // The register no longer owns its allocated slot and is clean.
            self.alloc_reg_slot[idx] = None;
            self.alloc_dirty[idx] = false;
        }
        // Any known-constant tracking for this register is stale as well.
        self.reg_to_const[idx] = None;
    }
928
929    /// Clear the entire register cache (at block boundaries).
930    ///
931    /// Clears the general cache (`slot_cache`, `reg_to_slot`, `reg_to_const`) and
932    /// allocated register state (`alloc_reg_slot`, `alloc_dirty`).
933    pub fn clear_reg_cache(&mut self) {
934        self.slot_cache.clear();
935        self.reg_to_slot = [None; 13];
936        self.reg_to_const = [None; 13];
937        self.clear_allocated_reg_state();
938    }
939
940    /// Clear the general register cache but preserve allocated register state.
941    ///
942    /// Used at block boundaries in leaf functions where allocated registers
943    /// are never clobbered by calls. With lazy spill, dirty values are preserved
944    /// (they remain valid in registers across blocks in leaf functions).
945    pub fn clear_reg_cache_preserving_alloc(&mut self) {
946        self.slot_cache.clear();
947        self.reg_to_slot = [None; 13];
948        self.reg_to_const = [None; 13];
949        // alloc_reg_slot and alloc_dirty intentionally NOT cleared for leaf functions.
950    }
951
952    fn clear_allocated_reg_state(&mut self) {
953        for &reg in self.regalloc.reg_to_slot.keys() {
954            let idx = reg as usize;
955            self.alloc_reg_slot[idx] = None;
956            self.alloc_dirty[idx] = false;
957        }
958    }
959
960    /// Set allocated register state from a snapshot's `alloc_reg_slot`.
961    /// Only sets entries where the snapshot has Some — does not clear others.
962    pub fn set_alloc_reg_slot_from(&mut self, alloc_reg_slot: &[Option<i32>; 13]) {
963        for &reg in self.regalloc.reg_to_slot.keys() {
964            self.alloc_reg_slot[reg as usize] = alloc_reg_slot[reg as usize];
965        }
966    }
967
968    /// Set allocated register state from a snapshot, but only for registers
969    /// matching a filter predicate. Used for back-edge propagation where only
970    /// callee-saved registers not clobbered by calls are safe to propagate.
971    pub fn set_alloc_reg_slot_filtered(
972        &mut self,
973        alloc_reg_slot: &[Option<i32>; 13],
974        filter: impl Fn(u8) -> bool,
975    ) {
976        for &reg in self.regalloc.reg_to_slot.keys() {
977            if filter(reg) {
978                self.alloc_reg_slot[reg as usize] = alloc_reg_slot[reg as usize];
979            }
980        }
981    }
982
983    /// Intersect allocated register state with a snapshot.
984    /// Only keeps entries where both the current state and the snapshot agree.
985    pub fn intersect_alloc_reg_slot(&mut self, other: &[Option<i32>; 13]) {
986        for &reg in self.regalloc.reg_to_slot.keys() {
987            let idx = reg as usize;
988            if self.alloc_reg_slot[idx] != other[idx] {
989                self.alloc_reg_slot[idx] = None;
990            }
991        }
992    }
993
994    /// Take a snapshot of the current register cache state.
995    pub fn snapshot_cache(&self) -> CacheSnapshot {
996        CacheSnapshot {
997            slot_cache: self.slot_cache.clone(),
998            reg_to_slot: self.reg_to_slot,
999            reg_to_const: self.reg_to_const,
1000            alloc_reg_slot: self.alloc_reg_slot,
1001            alloc_dirty: self.alloc_dirty,
1002        }
1003    }
1004
1005    /// Restore register cache state from a snapshot.
1006    pub fn restore_cache(&mut self, snapshot: &CacheSnapshot) {
1007        self.slot_cache.clone_from(&snapshot.slot_cache);
1008        self.reg_to_slot = snapshot.reg_to_slot;
1009        self.reg_to_const = snapshot.reg_to_const;
1010        self.alloc_reg_slot = snapshot.alloc_reg_slot;
1011        self.alloc_dirty = snapshot.alloc_dirty;
1012    }
1013
1014    /// Define a label without clearing the register cache.
1015    /// Used for cross-block cache propagation when the cache will be restored from a snapshot.
1016    pub fn define_label_preserving_cache(&mut self, label: usize) {
1017        if self
1018            .instructions
1019            .last()
1020            .is_some_and(|last| !last.is_terminating())
1021        {
1022            self.emit(Instruction::Fallthrough);
1023        }
1024        self.labels[label] = Some(self.current_offset());
1025    }
1026
1027    // ── Fixup resolution ──
1028
    /// Patch every recorded jump fixup with its final relative byte offset.
    ///
    /// Each fixup pairs an instruction index with a label id; the label must
    /// have been defined by now, otherwise this returns an error.
    ///
    /// # Errors
    /// Returns `Error::Unsupported` if a referenced label was never defined,
    /// or if a fixup points at an instruction variant that carries no offset.
    pub fn resolve_fixups(&mut self) -> Result<()> {
        // Precompute byte offsets for each instruction to avoid O(n²) re-scanning.
        let mut offsets = Vec::with_capacity(self.instructions.len());
        let mut running = 0usize;
        for instr in &self.instructions {
            offsets.push(running);
            running += instr.encode().len();
        }

        for &(instr_idx, label_id) in &self.fixups {
            let target_offset = self.labels[label_id]
                .ok_or_else(|| Error::Unsupported("unresolved label".to_string()))?;

            // PVM jump offsets are relative to the instruction start.
            let instr_start = offsets[instr_idx];
            let relative_offset = (target_offset as i32) - (instr_start as i32);

            // Every branch/jump variant that carries a label-relative offset.
            match &mut self.instructions[instr_idx] {
                Instruction::Jump { offset }
                | Instruction::LoadImmJump { offset, .. }
                | Instruction::BranchNeImm { offset, .. }
                | Instruction::BranchEqImm { offset, .. }
                | Instruction::BranchGeSImm { offset, .. }
                | Instruction::BranchLtUImm { offset, .. }
                | Instruction::BranchLeUImm { offset, .. }
                | Instruction::BranchGeUImm { offset, .. }
                | Instruction::BranchGtUImm { offset, .. }
                | Instruction::BranchLtSImm { offset, .. }
                | Instruction::BranchLeSImm { offset, .. }
                | Instruction::BranchGtSImm { offset, .. }
                | Instruction::BranchEq { offset, .. }
                | Instruction::BranchNe { offset, .. }
                | Instruction::BranchGeU { offset, .. }
                | Instruction::BranchLtU { offset, .. }
                | Instruction::BranchLtS { offset, .. }
                | Instruction::BranchGeS { offset, .. } => {
                    *offset = relative_offset;
                }
                _ => {
                    return Err(Error::Unsupported(
                        "cannot fixup non-jump instruction".to_string(),
                    ));
                }
            }
        }
        Ok(())
    }
1076}
1077
1078/// Try to extract a constant integer value from a `BasicValueEnum` without emitting instructions.
1079/// Returns `Some(i64)` for compile-time constants, `None` for SSA values.
1080pub fn try_get_constant(val: BasicValueEnum<'_>) -> Option<i64> {
1081    if let BasicValueEnum::IntValue(iv) = val {
1082        iv.get_sign_extended_constant()
1083    } else {
1084        None
1085    }
1086}
1087
1088/// Get the i-th operand of an instruction as a `BasicValueEnum`.
1089pub fn get_operand(instr: InstructionValue<'_>, i: u32) -> Result<BasicValueEnum<'_>> {
1090    instr
1091        .get_operand(i)
1092        .and_then(inkwell::values::Operand::value)
1093        .ok_or_else(|| Error::Internal(format!("missing operand {i} in {:?}", instr.get_opcode())))
1094}
1095
1096/// Get the i-th operand of an instruction as a `BasicBlock`.
1097pub fn get_bb_operand(instr: InstructionValue<'_>, i: u32) -> Result<BasicBlock<'_>> {
1098    instr
1099        .get_operand(i)
1100        .and_then(inkwell::values::Operand::block)
1101        .ok_or_else(|| {
1102            Error::Internal(format!(
1103                "missing bb operand {i} in {:?}",
1104                instr.get_opcode()
1105            ))
1106        })
1107}
1108
1109/// Get the slot offset for an instruction's result.
1110pub fn result_slot(e: &PvmEmitter<'_>, instr: InstructionValue<'_>) -> Result<i32> {
1111    let key = val_key_instr(instr);
1112    e.get_slot(key)
1113        .ok_or_else(|| Error::Internal(format!("no slot for {:?} result", instr.get_opcode())))
1114}
1115
1116/// Get the register to use for this instruction's result.
1117/// Returns the allocated register (for store-side coalescing) or `TEMP_RESULT` as fallback.
1118/// When the result has an allocated register, computing directly into it avoids a
1119/// subsequent `MoveReg` in `store_to_slot`.
1120pub fn result_reg(e: &PvmEmitter<'_>, instr: InstructionValue<'_>) -> u8 {
1121    result_reg_or(e, instr, crate::abi::TEMP_RESULT)
1122}
1123
1124/// Like `result_reg` but with a custom fallback register.
1125/// Used by lowering paths that naturally use a different working register
1126/// (e.g., zext/sext/trunc use `TEMP1` instead of `TEMP_RESULT`).
1127pub fn result_reg_or(e: &PvmEmitter<'_>, instr: InstructionValue<'_>, fallback: u8) -> u8 {
1128    let key = val_key_instr(instr);
1129    if let Some(&alloc_reg) = e.regalloc.val_to_reg.get(&key) {
1130        alloc_reg
1131    } else {
1132        fallback
1133    }
1134}
1135
1136/// Get the allocated register for a value if it's currently valid, otherwise return fallback.
1137/// Used for load-side coalescing: callers can use the allocated register directly as an
1138/// instruction operand instead of copying to a temp register via `load_operand`.
1139pub fn operand_reg(e: &PvmEmitter<'_>, val: BasicValueEnum<'_>, fallback: u8) -> u8 {
1140    if let BasicValueEnum::IntValue(iv) = val {
1141        // Constants don't have allocated registers.
1142        if iv.get_sign_extended_constant().is_some() || iv.get_zero_extended_constant().is_some() {
1143            return fallback;
1144        }
1145        if iv.is_poison() || iv.is_undef() {
1146            return fallback;
1147        }
1148        let key = val_key_int(iv);
1149        if let Some(&alloc_reg) = e.regalloc.val_to_reg.get(&key)
1150            && let Some(slot) = e.get_slot(key)
1151            && e.alloc_reg_slot[alloc_reg as usize] == Some(slot)
1152        {
1153            return alloc_reg;
1154        }
1155    }
1156    fallback
1157}
1158
1159/// Detect the bit width of an instruction's **result** type.
1160///
1161/// For most instructions (binary ops, comparisons), this is the correct width
1162/// to use for choosing 32-bit vs 64-bit PVM instructions.
1163///
1164/// **Warning**: For conversion instructions (`SExt`, `ZExt`, `Trunc`), the result
1165/// type differs from the source type. Use [`source_bit_width`] instead when
1166/// you need the source operand's width (e.g., to determine which sign extension
1167/// to emit).
1168pub fn operand_bit_width(instr: InstructionValue<'_>) -> u32 {
1169    // Prefer the instruction's result type.
1170    if let inkwell::types::AnyTypeEnum::IntType(ty) = instr.get_type() {
1171        return ty.get_bit_width();
1172    }
1173    // Fallback: check the first operand's type.
1174    if let Some(op) = instr
1175        .get_operand(0)
1176        .and_then(inkwell::values::Operand::value)
1177        && let BasicValueEnum::IntValue(iv) = op
1178    {
1179        return iv.get_type().get_bit_width();
1180    }
1181    64 // default
1182}
1183
1184/// Detect the bit width of an instruction's **source** (first operand) type.
1185///
1186/// This is the correct function for conversion instructions (`SExt`, `ZExt`, `Trunc`)
1187/// where you need to know what width the value is being converted *from*.
1188pub fn source_bit_width(instr: InstructionValue<'_>) -> u32 {
1189    if let Some(op) = instr
1190        .get_operand(0)
1191        .and_then(inkwell::values::Operand::value)
1192        && let BasicValueEnum::IntValue(iv) = op
1193    {
1194        return iv.get_type().get_bit_width();
1195    }
1196    64 // default
1197}
1198
1199/// Check whether `target_bb` has any phi nodes with incomings from `current_bb`.
1200pub fn has_phi_from<'ctx>(current_bb: BasicBlock<'ctx>, target_bb: BasicBlock<'ctx>) -> bool {
1201    for instr in target_bb.get_instructions() {
1202        use inkwell::values::InstructionOpcode;
1203        if instr.get_opcode() != InstructionOpcode::Phi {
1204            break;
1205        }
1206        let Ok(phi): std::result::Result<PhiValue<'ctx>, _> = instr.try_into() else {
1207            break;
1208        };
1209        let num_incomings = phi.count_incoming();
1210        for i in 0..num_incomings {
1211            if let Some((_, block)) = phi.get_incoming(i)
1212                && block == current_bb
1213            {
1214                return true;
1215            }
1216        }
1217    }
1218    false
1219}
1220
1221/// Check whether a basic block starts with any phi nodes.
1222pub fn block_has_phis(bb: BasicBlock<'_>) -> bool {
1223    bb.get_first_instruction()
1224        .is_some_and(|instr| instr.get_opcode() == inkwell::values::InstructionOpcode::Phi)
1225}
1226
1227/// Pre-scan function to allocate labels and slots.
1228///
1229/// Also determines which callee-saved registers are actually used (for shrink wrapping).
1230pub fn pre_scan_function<'ctx>(
1231    emitter: &mut PvmEmitter<'ctx>,
1232    function: FunctionValue<'ctx>,
1233    is_main: bool,
1234) {
1235    // Detect real calls to determine if this is a leaf function.
1236    // PVM intrinsics (__pvm_load_*, __pvm_store_*, etc.) and LLVM intrinsics
1237    // (llvm.*) are NOT real function calls — they don't use the calling
1238    // convention and don't clobber callee-saved registers (r9-r12).
1239    // Only wasm_func_* and __pvm_call_indirect are real calls.
1240    let mut has_calls = false;
1241    for bb in function.get_basic_blocks() {
1242        for instr in bb.get_instructions() {
1243            if instr.get_opcode() == inkwell::values::InstructionOpcode::Call && is_real_call(instr)
1244            {
1245                has_calls = true;
1246                break;
1247            }
1248        }
1249        if has_calls {
1250            break;
1251        }
1252    }
1253    emitter.has_calls = has_calls;
1254
1255    // Determine which callee-saved registers are used (shrink wrapping).
1256    if !is_main && emitter.config.shrink_wrap_enabled {
1257        let num_params = function.count_params() as usize;
1258        let mut used = [false; 4];
1259
1260        // Parameters mapped to r9-r12 count as used.
1261        for u in used
1262            .iter_mut()
1263            .take(crate::abi::MAX_LOCAL_REGS.min(num_params))
1264        {
1265            *u = true;
1266        }
1267
1268        // If the function contains any call instruction, all callee-saved regs are used
1269        // (because the callee may clobber them and expects us to preserve them).
1270        if has_calls {
1271            used = [true; 4];
1272        } else {
1273            // Check usage for non-call instructions if we assume registers might be used.
1274            // But since we don't allocate r9-r12 as temps, they are only used for params.
1275            // So `used` is already correct.
1276        }
1277
1278        emitter.used_callee_regs = used;
1279
1280        // Compute frame offsets for saved callee-saved registers.
1281        // Layout: [ra (optional), then used callee-saved regs contiguously].
1282        let mut offset = if has_calls { 8i32 } else { 0i32 }; // Reserve space for ra only if needed
1283        let mut offsets = [None; 4];
1284        for i in 0..4 {
1285            if used[i] {
1286                offsets[i] = Some(offset);
1287                offset += 8;
1288            }
1289        }
1290        emitter.callee_save_offsets = offsets;
1291        emitter.next_slot_offset = offset;
1292    }
1293    // When shrink wrapping is disabled or is_main, keep defaults (all regs, FRAME_HEADER_SIZE).
1294
1295    // Compute single-predecessor map for cross-block register cache.
1296    if emitter.config.cross_block_cache_enabled && emitter.config.register_cache_enabled {
1297        let blocks = function.get_basic_blocks();
1298        let mut pred_count: HashMap<BasicBlock<'ctx>, usize> = HashMap::new();
1299        let mut pred_from: HashMap<BasicBlock<'ctx>, BasicBlock<'ctx>> = HashMap::new();
1300
1301        for bb in &blocks {
1302            if let Some(term) = bb.get_terminator() {
1303                let successors = super::successors::collect_successors(term);
1304                // Deduplicate successors per predecessor (e.g. switch cases targeting the same block)
1305                // so that multiple edges from the same bb don't inflate the predecessor count.
1306                let unique_succs: HashSet<_> = successors.into_iter().collect();
1307                for succ in unique_succs {
1308                    let count = pred_count.entry(succ).or_insert(0);
1309                    *count += 1;
1310                    pred_from.insert(succ, *bb);
1311                }
1312            }
1313        }
1314
1315        for (bb, count) in &pred_count {
1316            if *count == 1
1317                && let Some(pred) = pred_from.get(bb)
1318            {
1319                emitter.block_single_pred.insert(*bb, *pred);
1320            }
1321        }
1322    }
1323
1324    // Allocate slots for function parameters.
1325    for param in function.get_params() {
1326        let key = val_key_basic(param);
1327        emitter.alloc_slot_for_key(key);
1328    }
1329
1330    // Allocate labels for all basic blocks.
1331    for bb in function.get_basic_blocks() {
1332        let label = emitter.alloc_label();
1333        emitter.block_labels.insert(bb, label);
1334    }
1335
1336    // Allocate slots for instruction results that produce integer values.
1337    for bb in function.get_basic_blocks() {
1338        for instr in bb.get_instructions() {
1339            if instruction_produces_value(instr) {
1340                let key = val_key_instr(instr);
1341                emitter.alloc_slot_for_key(key);
1342            }
1343        }
1344    }
1345}
1346
1347fn instruction_produces_value(instr: InstructionValue<'_>) -> bool {
1348    use inkwell::values::InstructionOpcode;
1349    matches!(
1350        instr.get_opcode(),
1351        InstructionOpcode::Add
1352            | InstructionOpcode::Sub
1353            | InstructionOpcode::Mul
1354            | InstructionOpcode::UDiv
1355            | InstructionOpcode::SDiv
1356            | InstructionOpcode::URem
1357            | InstructionOpcode::SRem
1358            | InstructionOpcode::And
1359            | InstructionOpcode::Or
1360            | InstructionOpcode::Xor
1361            | InstructionOpcode::Shl
1362            | InstructionOpcode::LShr
1363            | InstructionOpcode::AShr
1364            | InstructionOpcode::ICmp
1365            | InstructionOpcode::ZExt
1366            | InstructionOpcode::SExt
1367            | InstructionOpcode::Trunc
1368            | InstructionOpcode::Select
1369            | InstructionOpcode::Phi
1370            | InstructionOpcode::Load
1371            | InstructionOpcode::Call
1372    )
1373}
1374
1375/// Check whether an LLVM Call instruction is a "real" function call that uses the
1376/// calling convention and may clobber callee-saved registers (r9-r12).
1377///
1378/// PVM intrinsics (`__pvm_load_*`, `__pvm_store_*`, `__pvm_memory_*`, etc.) and
1379/// LLVM intrinsics (`llvm.*`) are lowered inline — they only use temp/scratch
1380/// registers and never clobber callee-saved registers.
1381///
1382/// Real calls: `wasm_func_*` (direct) and `__pvm_call_indirect` (indirect).
1383pub(super) fn is_real_call(instr: InstructionValue<'_>) -> bool {
1384    let call_site: std::result::Result<inkwell::values::CallSiteValue, _> = instr.try_into();
1385    let Ok(call_site) = call_site else {
1386        return true; // Conservative: treat as call if we can't classify.
1387    };
1388    let Some(fn_val) = call_site.get_called_fn_value() else {
1389        return true; // Indirect call without known callee.
1390    };
1391    let name = fn_val.get_name().to_string_lossy();
1392    // PVM intrinsics and LLVM intrinsics are NOT real calls.
1393    if name.starts_with("__pvm_") && name != "__pvm_call_indirect" {
1394        return false;
1395    }
1396    if name.starts_with("llvm.") {
1397        return false;
1398    }
1399    true
1400}
1401
1402/// Check whether any instruction in the function will lower to a path that
1403/// clobbers r5/r6 (`abi::SCRATCH1`/`abi::SCRATCH2`).
1404///
1405/// Returns `true` if the function is safe (no r5/r6 clobbers).
1406///
1407/// Operations that clobber r5/r6:
1408/// - `__pvm_memory_grow`, `__pvm_memory_fill`, `__pvm_memory_copy`, `__pvm_memory_init`
1409/// - `llvm.fshl.*`, `llvm.fshr.*` (funnel shifts — conservatively flagged even if
1410///   they might lower to a rotation which doesn't clobber)
1411pub fn scratch_regs_safe(function: FunctionValue<'_>) -> bool {
1412    use inkwell::values::InstructionOpcode;
1413
1414    for bb in function.get_basic_blocks() {
1415        for instr in bb.get_instructions() {
1416            if instr.get_opcode() == InstructionOpcode::Call {
1417                let call_site: std::result::Result<inkwell::values::CallSiteValue, _> =
1418                    instr.try_into();
1419                let Ok(cs) = call_site else {
1420                    continue;
1421                };
1422                let Some(fn_val) = cs.get_called_fn_value() else {
1423                    continue;
1424                };
1425                let name = fn_val.get_name().to_string_lossy();
1426                // Bulk memory intrinsics clobber r5/r6.
1427                if matches!(
1428                    name.as_ref(),
1429                    "__pvm_memory_grow"
1430                        | "__pvm_memory_fill"
1431                        | "__pvm_memory_copy"
1432                        | "__pvm_memory_init"
1433                ) {
1434                    return false;
1435                }
1436                // Funnel shifts (conservatively including rotations).
1437                if name.starts_with("llvm.fshl.") || name.starts_with("llvm.fshr.") {
1438                    return false;
1439                }
1440            }
1441        }
1442    }
1443    true
1444}
1445
// Re-export scratch registers for other modules.
// NOTE(review): this aliases `ARGS_LEN_REG`/`ARGS_PTR_REG` as SCRATCH1/SCRATCH2,
// while code in this module references `crate::abi::SCRATCH1`/`SCRATCH2` directly —
// confirm both name pairs refer to the same physical registers, otherwise this
// alias is misleading to downstream modules.
pub use crate::abi::{ARGS_LEN_REG as SCRATCH1, ARGS_PTR_REG as SCRATCH2};