Skip to main content

relon_codegen_llvm/codegen/
mod.rs

1//! IR -> LLVM IR lowering.
2//!
3//! Phase B widens the emitter past the Phase A bootstrap envelope:
4//!
5//! - Two entry shapes:
6//!   - **Legacy-i64**: `(I64...) -> I64` — driven by
7//!     [`LlvmAotEvaluator::from_ir_direct`]. Mirrors the cranelift
8//!     crate's same-named envelope; used by the Phase A bootstrap
9//!     tests and the side-by-side `from_ir_direct` benchmarks.
10//!   - **Buffer-protocol**: `(*state, i32 in_ptr, i32 in_len,
11//!     i32 out_ptr, i32 out_cap, i64 caps) -> i32` — driven by
12//!     [`LlvmAotEvaluator::from_source`]. Matches what
13//!     `lower_workspace_single` emits for every user source.
14//!
15//! - Op set widened to the W1 / W2 production-source surface:
16//!   `LocalGet`, `ConstI64` / `ConstI32` / `ConstBool`, `LetGet` /
17//!   `LetSet`, `LoadField` / `StoreField` (scalar slots: I32 / I64 /
18//!   F64 / Bool / Unit), `Add` / `Sub` / `Mul` / `Div` / `Mod` /
19//!   `BitAnd` (`I32` and `I64`), comparison ops (`Eq` / `Ne` /
20//!   `Lt` / `Le` / `Gt` / `Ge` — `I32` / `I64` / `Bool` for `Eq`/`Ne`),
21//!   structured control flow (`Block` / `Loop` / `Br` / `BrIf` /
22//!   `If`), and `Return`.
23//!
24//! Ops outside the Phase B envelope (stdlib `Call`, pointer-indirect
25//! `StoreField`, `MakeClosure`, sandbox-trap helpers, schema-method
26//! dispatch, …) surface as [`crate::LlvmError::Codegen`]. They are
27//! tracked for Phase C.
28//!
29//! ## Control-flow lowering vs cranelift
30//!
31//! Cranelift's `block-with-params` keeps phi nodes implicit (every
32//! branch passes the carried values as block arguments). LLVM IR
33//! requires explicit `phi` nodes per joining basic block. We avoid
34//! both by spilling the IR stack through `alloca` slots whenever
35//! control flow joins, and reading them back on the consumer side.
36//! That mirrors how a naive byte-code-to-LLVM emitter behaves and
37//! relies on LLVM's `mem2reg` pass at -O2/-O3 to turn the alloca
38//! reads back into SSA values + phis. For the W1 / W2 hot loops
39//! `mem2reg` collapses the alloca traffic into a single
40//! loop-carried IR value (verified via `emit_ir_dump`'s output at
41//! `-O2`).
42//!
43//! ## Stack discipline
44//!
45//! The IR's stack machine carries one value per push. We track the
46//! per-op operand stack as `Vec<IntValue>` (every IR value the W1/W2
47//! envelope produces fits in an integer type — I32 for Bool / I32-
48//! tagged values, I64 for I64-tagged values). The wasm-style "every
49//! value above the operand stack is unreachable after `br`" rule
50//! lets us drop unconsumed stack slots silently — LLVM's verifier
51//! catches missing terminators if we forget to seal a block.
52
53use std::collections::HashMap;
54
55use inkwell::builder::Builder;
56use inkwell::context::Context;
57use inkwell::module::{Linkage, Module as LlvmModule};
58use inkwell::types::{BasicMetadataTypeEnum, BasicTypeEnum};
59use inkwell::values::{BasicValue, BasicValueEnum, FunctionValue, IntValue, PointerValue};
60use inkwell::{AddressSpace, IntPredicate};
61
62use relon_ir::ir::{Func, IrType, Module as IrModule, Op, TaggedOp};
63
64use crate::error::LlvmError;
65use crate::state::{ARENA_STATE_OFFSET_BASE, ARENA_STATE_OFFSET_TAIL_CURSOR};
66
67// Per-`Op`-family lowering modules. Each holds an
68// `impl<'ctx, 'b, 'cp> Emit<'ctx, 'b, 'cp>` block with the `emit_*`
69// methods for that family; the exhaustive `lower_op` dispatch below
70// delegates to them. Mirrors the cranelift backend's `codegen/*`
71// split so Phase 0b can fill unimplemented families in place without
72// colliding. (Behavior-preserving reorg — Phase 0a.)
73mod arith;
74mod call;
75mod closure;
76mod collections;
77mod control;
78mod mem;
79mod schema;
80mod string;
81mod unicode;
82
83// Family-local enums consumed by the central `lower_op` dispatch.
84use arith::BinOp;
85use mem::{AbsLoad, AbsStore};
86
/// Canonical export name of the entry function in the emitted LLVM
/// module.
///
/// The evaluator side looks this symbol up (`dlsym` / `get_function`)
/// after JIT finalize, which makes the string a cross-crate contract:
/// renaming it requires touching both crates simultaneously.
pub(crate) const ENTRY_SYMBOL: &str = "relon_llvm_entry";
92
93/// Tag a `load` instruction with `!invariant.load !{}` so LLVM treats
94/// every load from the address as returning the same value for the
95/// instruction's lifetime — letting GVN/LICM hoist it out of loops and
96/// collapse redundant reloads.
97///
98/// SOUND ONLY for genuinely call-invariant memory. The single caller is
99/// the per-entry / per-lambda `state.arena_base` word load
100/// (`ARENA_STATE_OFFSET_BASE`): the host fills the base pointer into the
101/// `ArenaState` struct *before* the entry runs and never mutates it for
102/// the call's duration (only the scratch / tail cursors at later offsets
103/// are written — see `state.rs`; no `build_store` ever targets offset 0).
104/// Without this tag LLVM reloads the base from the opaque state pointer on
105/// every arena access inside a loop (the W20 n-body inner loop showed a
106/// `mov (%state), %base` reload per pair access), because it cannot prove
107/// the intervening arena stores don't alias the state struct. The tag is
108/// metadata only — it changes no value, so every backend stays
109/// bit-identical.
110fn mark_invariant_load(ctx: &Context, loaded: BasicValueEnum<'_>) {
111    if let Some(inst) = loaded.as_instruction_value() {
112        let kind_id = ctx.get_kind_id("invariant.load");
113        let empty = ctx.metadata_node(&[]);
114        let _ = inst.set_metadata(empty, kind_id);
115    }
116}
117
/// Export name of the Phase D.1 dispatch-boundary fast-path entry.
///
/// This is a second exported entry emitted alongside the
/// buffer-protocol entry whenever the source's `#main(Int...) -> Int`
/// shape qualifies. It skips the HashMap pack + arena round-trip the
/// buffer envelope incurs, dropping the per-call boundary cost from the
/// ~650 ns band into the rust-native ballpark.
///
/// Only resolved when the evaluator's [`FastPathProfile`] is `Some`;
/// the symbol is absent from the JIT module otherwise.
pub(crate) const ENTRY_SYMBOL_FAST: &str = "relon_llvm_entry_fast";
127
/// Which signature the LLVM emitter should generate for the entry
/// function.
///
/// Mirrors the cranelift crate's `EntryShape` enum so a side-by-side
/// comparison of the two backends shares the same vocabulary.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum EntryShape {
    /// `(I64...) -> I64`. The Phase A bootstrap envelope — used by
    /// `from_ir_direct` callers (tests, helloworld_arith fixtures).
    LegacyI64,
    /// `(*state, i32 in_ptr, i32 in_len, i32 out_ptr, i32 out_cap,
    /// i64 caps) -> i32`. The shape `lower_workspace_single`
    /// synthesises for every user `#main` source. State is the
    /// first parameter to match the cranelift backend's
    /// `BufferEntryFn` layout.
    Buffer,
}
143
/// Stage 1.B: how `Op::CallNative` is lowered.
///
/// - **Open-world**: dynamic dispatch through the
///   `relon_llvm_call_native` helper, resolved at runtime via
///   `add_global_mapping`.
/// - **Closed-world**: static dispatch — a direct `call @<host_symbol>`
///   to an `extern` declaration that the LTO co-compile step later
///   links + inlines.
///
/// `OpenWorld` is the default and the only path MCJIT / `from_source`
/// ever uses — it must stay reachable verbatim. `ClosedWorld` is only
/// selected by the co-compile orchestration (`crate::cocompile`) when
/// the full host-fn set is known at emit time (the build.rs /
/// `emit_object` path), mirroring cranelift's *static* `cap_lookup ->
/// fn_ptr` arm rather than its `_dynamic` helper arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum WorldMode {
    /// Dynamic dispatch through `relon_llvm_call_native`. Default so
    /// the existing MCJIT / `from_source` path is untouched.
    #[default]
    OpenWorld,
    /// Static `call @<host_symbol>` to an external declaration. The
    /// host bitcode is linked in + inlined by the LTO co-compile pass.
    ClosedWorld,
}
166
/// Phase D.1 fast-path profile: describes a `#main(Int...) -> Int`
/// source shape eligible for the typed legacy-i64 dispatch fast path.
///
/// The profile maps each declared `#main` Int parameter's buffer
/// offset to the LLVM fast entry's i64 positional slot, and records
/// the offset of the single Int return slot so the trailing
/// `StoreField` can be rewritten into a `ret`. Used exclusively by
/// [`emit_fast_entry`].
#[derive(Debug, Clone)]
pub(crate) struct FastPathProfile {
    /// One entry per declared `#main` arg: the field's byte offset in
    /// the input buffer (matches what `LoadField { offset }` carries
    /// in the IR body). Vector order parallels schema declaration
    /// order.
    ///
    /// NOTE(review): only the offset is stored; the i64 slot index in
    /// the fast entry signature is presumably the element's position in
    /// this vector — confirm against `emit_fast_entry`.
    pub(crate) arg_offsets: Vec<u32>,
    /// Byte offset of the single `value` field in the return buffer.
    /// The trailing `StoreField { offset, ty: I64 }` whose offset
    /// matches this value gets rewritten into a `ret` on the value
    /// (after popping the IR stack normally). Any other `StoreField`
    /// surfaces as an emitter error — the fast path only handles
    /// single-value-wrapper returns.
    pub(crate) ret_offset: u32,
}
190
191/// Phase E.1: per-module const-pool blob laid out at compile time and
192/// copied into the arena prefix on every dispatch. Mirrors
193/// `relon_codegen_cranelift::codegen::ConstPool` (shape only — the LLVM
194/// side keeps it scoped to this crate so the dep direction stays
195/// one-way).
196///
197/// Layout: `[len: u32 LE][utf8 bytes]` records emitted in IR-walk
198/// order, aligned to 4. Each `Op::ConstString { idx }` resolves to
199/// `string_offsets[idx]` — the byte offset of its record inside
200/// [`Self::bytes`] (= the arena-relative offset once the host has
201/// copied the blob to the arena prefix).
202#[derive(Debug, Default, Clone)]
203pub struct ConstPool {
204    /// `idx -> byte offset within `bytes`. The emitter materialises
205    /// `Op::ConstString { idx }` as `iconst(I32, string_offsets[idx])`.
206    pub string_offsets: std::collections::HashMap<u32, u32>,
207    /// `List<Int>` pool: `idx -> byte offset`. Mirrors cranelift's
208    /// `ConstPool::list_int_offsets`; record layout is
209    /// `[len: u32 LE][pad: u32][i64 elements LE]`, aligned to 8.
210    pub list_int_offsets: std::collections::HashMap<u32, u32>,
211    /// `List<Float>` pool: `idx -> byte offset`. Same layout as
212    /// `list_int_offsets` (f64 elements stored as their u64 LE
213    /// bit-pattern), aligned to 8.
214    pub list_float_offsets: std::collections::HashMap<u32, u32>,
215    /// `List<Bool>` pool: `idx -> byte offset`. Record layout is
216    /// `[len: u32 LE][u8 booleans]` (tightly packed), aligned to 4.
217    pub list_bool_offsets: std::collections::HashMap<u32, u32>,
218    /// W5-P2: `List<String>` pointer-array pool: `idx -> header byte
219    /// offset`. Record layout (byte-identical to cranelift's
220    /// `visit_const_list_string`): each element's `[slen: u32 LE][utf8]`
221    /// String record is emitted first (4-aligned), then the header
222    /// `[len: u32 LE][off_0: u32 LE]...[off_{N-1}: u32 LE]` whose
223    /// `off_i` is the arena-relative offset of String record `i`.
224    pub list_string_offsets: std::collections::HashMap<u32, u32>,
225    /// W5-P1/P3: `{String -> Int}` dict pool: `idx -> record byte
226    /// offset`. Record layout (byte-identical to cranelift's
227    /// `visit_const_dict`): `[entry_count: u32 LE][pad: u32][shape_hash:
228    /// u64 LE]` header, a `[key_off: u32][key_len: u32][value: i64]`
229    /// entry table sorted by key bytes, then the concatenated UTF-8 key
230    /// payload (`key_off` record-relative). The W5-P3 dict-get probe
231    /// binary-/linear-searches this table at runtime.
232    pub dict_offsets: std::collections::HashMap<u32, u32>,
233    /// Wave R14: Unicode `*TableAddr` pool. Each distinct
234    /// [`unicode::UnicodeTable`] referenced anywhere in the module (incl.
235    /// inlined bundled-stdlib helper bodies) is encoded once via the
236    /// shared `relon_ir` encoders and laid into [`Self::bytes`]; the
237    /// `*TableAddr` op resolves to the recorded arena-relative offset.
238    /// Byte-identical to cranelift's per-table `ConstPool` slots.
239    pub(crate) unicode_table_offsets: std::collections::HashMap<unicode::UnicodeTable, u32>,
240    /// Materialised bytes in record order. The host trampoline copies
241    /// these verbatim to `arena[..bytes.len()]` before every dispatch.
242    pub bytes: Vec<u8>,
243}
244
245impl ConstPool {
246    /// Build the pool by walking every function body in `module` and
247    /// collecting each unique `Op::ConstString { idx, value }`. Records
248    /// are laid out in walk-order with 4-byte alignment.
249    pub fn from_module(module: &IrModule) -> Result<Self, LlvmError> {
250        let mut pool = ConstPool::default();
251        for func in &module.funcs {
252            pool.collect_body(&func.body)?;
253        }
254        Ok(pool)
255    }
256
257    fn collect_body(&mut self, body: &[TaggedOp]) -> Result<(), LlvmError> {
258        for tagged in body {
259            self.collect_op(&tagged.op)?;
260        }
261        Ok(())
262    }
263
264    fn collect_op(&mut self, op: &Op) -> Result<(), LlvmError> {
265        match op {
266            Op::ConstString { idx, value } => self.add_string(*idx, value),
267            Op::ConstListInt { idx, elements } => self.add_list_int(*idx, elements),
268            Op::ConstListFloat { idx, elements } => self.add_list_float(*idx, elements),
269            Op::ConstListBool { idx, elements } => self.add_list_bool(*idx, elements),
270            Op::ConstListString { idx, elements } => self.add_list_string(*idx, elements),
271            Op::ConstDict { idx, entries } => self.add_dict(*idx, entries),
272            Op::Block { body, .. } | Op::Loop { body, .. } => self.collect_body(body),
273            Op::If {
274                then_body,
275                else_body,
276                ..
277            } => {
278                self.collect_body(then_body)?;
279                self.collect_body(else_body)
280            }
281            // Op::Call inlines a bundled-stdlib body whose own
282            // `Op::ConstString` literals must also land in the pool —
283            // mirror cranelift's recursion through `builtin_stdlib`.
284            Op::Call { fn_index, .. } => {
285                let stdlib = relon_ir::stdlib::builtin_stdlib();
286                if let Some(callee) = stdlib.get(*fn_index as usize) {
287                    let body = callee.body_owned();
288                    self.collect_body(&body)?;
289                }
290                Ok(())
291            }
292            // Wave R14: Unicode `*TableAddr` ops. Lay each referenced
293            // table into the const prefix once (deduped by table identity)
294            // so the lowering resolves to a fixed offset instead of
295            // copying the table into scratch per op-execution.
296            other => {
297                if let Some(table) = unicode::UnicodeTable::from_op(other) {
298                    self.add_unicode_table(table)?;
299                }
300                Ok(())
301            }
302        }
303    }
304
305    /// Lay `table`'s encoded bytes into the pool on first reference and
306    /// record the arena-relative offset. The byte encoder is the exact
307    /// shared `relon_ir` function cranelift's `ConstPool` calls, so the
308    /// data a lookup helper reads is byte-identical across backends.
309    /// Aligned to 4 to match every `*TableAddr` slot on the cranelift
310    /// side (the table headers are read with 4-byte-aligned i32 loads).
311    fn add_unicode_table(&mut self, table: unicode::UnicodeTable) -> Result<(), LlvmError> {
312        if self.unicode_table_offsets.contains_key(&table) {
313            return Ok(());
314        }
315        self.align_to(4);
316        let off = u32::try_from(self.bytes.len())
317            .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
318        let bytes = table.encode_bytes();
319        self.bytes.extend_from_slice(&bytes);
320        self.unicode_table_offsets.insert(table, off);
321        Ok(())
322    }
323
324    fn add_string(&mut self, idx: u32, value: &str) -> Result<(), LlvmError> {
325        if self.string_offsets.contains_key(&idx) {
326            return Ok(());
327        }
328        // Align to 4 so the `[len: u32]` header lands on a 4-byte
329        // boundary — i32 loads through the JIT use `align=4` and we
330        // don't want an unaligned trap on hosts where it matters.
331        self.align_to(4);
332        let off = u32::try_from(self.bytes.len())
333            .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
334        let len = u32::try_from(value.len())
335            .map_err(|_| LlvmError::Codegen("ConstString length exceeds u32 range".into()))?;
336        self.bytes.extend_from_slice(&len.to_le_bytes());
337        self.bytes.extend_from_slice(value.as_bytes());
338        self.string_offsets.insert(idx, off);
339        Ok(())
340    }
341
342    /// Pad `bytes` up to the next `align` boundary with zero fill.
343    /// Mirrors cranelift's `ConstPool::align_to`.
344    fn align_to(&mut self, align: usize) {
345        let rem = self.bytes.len() % align;
346        if rem != 0 {
347            self.bytes.resize(self.bytes.len() + (align - rem), 0);
348        }
349    }
350
351    /// Lay out a `List<Int>` record. Byte layout
352    /// `[len: u32 LE][pad: u32 zero][i64 elements LE]`, aligned to 8 —
353    /// byte-identical to cranelift's `visit_const_list_int` (cross-
354    /// backend arena data contract).
355    fn add_list_int(&mut self, idx: u32, elements: &[i64]) -> Result<(), LlvmError> {
356        if self.list_int_offsets.contains_key(&idx) {
357            return Ok(());
358        }
359        self.align_to(8);
360        let off = u32::try_from(self.bytes.len())
361            .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
362        let len = u32::try_from(elements.len())
363            .map_err(|_| LlvmError::Codegen("ConstListInt length exceeds u32 range".into()))?;
364        self.bytes.extend_from_slice(&len.to_le_bytes());
365        self.bytes.extend_from_slice(&[0u8; 4]); // pad to 8
366        for e in elements {
367            self.bytes.extend_from_slice(&e.to_le_bytes());
368        }
369        self.list_int_offsets.insert(idx, off);
370        Ok(())
371    }
372
373    /// Lay out a `List<Float>` record. Same layout as `add_list_int`
374    /// (f64 elements stored as their u64 LE bit-pattern), aligned to 8 —
375    /// byte-identical to cranelift's `visit_const_list_float`.
376    fn add_list_float(&mut self, idx: u32, elements: &[u64]) -> Result<(), LlvmError> {
377        if self.list_float_offsets.contains_key(&idx) {
378            return Ok(());
379        }
380        self.align_to(8);
381        let off = u32::try_from(self.bytes.len())
382            .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
383        let len = u32::try_from(elements.len())
384            .map_err(|_| LlvmError::Codegen("ConstListFloat length exceeds u32 range".into()))?;
385        self.bytes.extend_from_slice(&len.to_le_bytes());
386        self.bytes.extend_from_slice(&[0u8; 4]); // pad to 8
387        for e in elements {
388            self.bytes.extend_from_slice(&e.to_le_bytes());
389        }
390        self.list_float_offsets.insert(idx, off);
391        Ok(())
392    }
393
394    /// Lay out a `List<Bool>` record. Byte layout
395    /// `[len: u32 LE][u8 booleans]` (tightly packed), aligned to 4 —
396    /// byte-identical to cranelift's `visit_const_list_bool`.
397    fn add_list_bool(&mut self, idx: u32, elements: &[bool]) -> Result<(), LlvmError> {
398        if self.list_bool_offsets.contains_key(&idx) {
399            return Ok(());
400        }
401        self.align_to(4);
402        let off = u32::try_from(self.bytes.len())
403            .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
404        let len = u32::try_from(elements.len())
405            .map_err(|_| LlvmError::Codegen("ConstListBool length exceeds u32 range".into()))?;
406        self.bytes.extend_from_slice(&len.to_le_bytes());
407        for e in elements {
408            self.bytes.push(if *e { 1 } else { 0 });
409        }
410        self.list_bool_offsets.insert(idx, off);
411        Ok(())
412    }
413
414    /// W5-P2: lay out a `List<String>` pointer-array record. Each
415    /// element's `[slen: u32 LE][utf8]` String record is emitted first
416    /// (4-aligned), then the header `[len: u32 LE][off_0: u32 LE]...`
417    /// whose `off_i` is the arena-relative offset of String record `i`.
418    /// Byte-identical to cranelift's `visit_const_list_string` (cross-
419    /// backend arena data contract); the `idx -> header offset` map is
420    /// what `Op::ConstListString` resolves to.
421    fn add_list_string(&mut self, idx: u32, elements: &[String]) -> Result<(), LlvmError> {
422        if self.list_string_offsets.contains_key(&idx) {
423            return Ok(());
424        }
425        self.align_to(4);
426        let mut str_offsets: Vec<u32> = Vec::with_capacity(elements.len());
427        for s in elements {
428            self.align_to(4);
429            let s_off = u32::try_from(self.bytes.len()).map_err(|_| {
430                LlvmError::Codegen("ConstListString string offset exceeds u32".into())
431            })?;
432            let slen = u32::try_from(s.len()).map_err(|_| {
433                LlvmError::Codegen("ConstListString element length exceeds u32".into())
434            })?;
435            self.bytes.extend_from_slice(&slen.to_le_bytes());
436            self.bytes.extend_from_slice(s.as_bytes());
437            str_offsets.push(s_off);
438        }
439        self.align_to(4);
440        let header_off = u32::try_from(self.bytes.len())
441            .map_err(|_| LlvmError::Codegen("ConstListString header offset exceeds u32".into()))?;
442        let len = u32::try_from(elements.len())
443            .map_err(|_| LlvmError::Codegen("ConstListString length exceeds u32".into()))?;
444        self.bytes.extend_from_slice(&len.to_le_bytes());
445        for off in &str_offsets {
446            self.bytes.extend_from_slice(&off.to_le_bytes());
447        }
448        self.list_string_offsets.insert(idx, header_off);
449        Ok(())
450    }
451
452    /// W5-P1/P3: lay out a `{String -> Int}` dict record. Byte-identical
453    /// to cranelift's `const_pool::visit_const_dict` (cross-backend
454    /// arena data contract) so the W5-P3 dict-get probe reads the same
455    /// bytes on either backend:
456    ///
457    /// ```text
458    /// [entry_count: u32 LE][pad: u32][shape_hash: u64 LE]   ; 16-byte header
459    /// entry_count × [key_off: u32 LE][key_len: u32 LE][value: i64 LE]
460    /// concatenated UTF-8 key bytes                          ; key_off record-rel
461    /// ```
462    ///
463    /// The entry table is sorted by key bytes (deterministic + probe-
464    /// friendly); the record start is 8-aligned so the i64 values + the
465    /// u64 shape_hash land on natural boundaries.
466    fn add_dict(&mut self, idx: u32, entries: &[(String, i64)]) -> Result<(), LlvmError> {
467        if self.dict_offsets.contains_key(&idx) {
468            return Ok(());
469        }
470        self.align_to(8);
471        let off = u32::try_from(self.bytes.len())
472            .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
473
474        let mut sorted: Vec<&(String, i64)> = entries.iter().collect();
475        sorted.sort_by(|a, b| a.0.as_bytes().cmp(b.0.as_bytes()));
476
477        let entry_count = u32::try_from(sorted.len())
478            .map_err(|_| LlvmError::Codegen("ConstDict entry count exceeds u32".into()))?;
479        let shape_hash =
480            relon_ir::shape_hash::shape_hash_for_keys(sorted.iter().map(|(k, _)| k.as_str()));
481
482        // Header.
483        self.bytes.extend_from_slice(&entry_count.to_le_bytes());
484        self.bytes.extend_from_slice(&[0u8; 4]); // pad: keep shape_hash 8-aligned
485        self.bytes.extend_from_slice(&shape_hash.to_le_bytes());
486
487        const HEADER_BYTES: u32 = 16;
488        const ENTRY_BYTES: u32 = 16;
489        let table_bytes = entry_count
490            .checked_mul(ENTRY_BYTES)
491            .ok_or_else(|| LlvmError::Codegen("ConstDict table size overflow".into()))?;
492        let key_payload_base = HEADER_BYTES
493            .checked_add(table_bytes)
494            .ok_or_else(|| LlvmError::Codegen("ConstDict key base overflow".into()))?;
495
496        // Entry table. key_off is record-relative; accumulate as we go.
497        let mut running_key_off = key_payload_base;
498        for (key, value) in &sorted {
499            let key_len = u32::try_from(key.len())
500                .map_err(|_| LlvmError::Codegen("ConstDict key length exceeds u32".into()))?;
501            self.bytes.extend_from_slice(&running_key_off.to_le_bytes());
502            self.bytes.extend_from_slice(&key_len.to_le_bytes());
503            self.bytes.extend_from_slice(&value.to_le_bytes());
504            running_key_off = running_key_off
505                .checked_add(key_len)
506                .ok_or_else(|| LlvmError::Codegen("ConstDict key offset overflow".into()))?;
507        }
508
509        // Key payload.
510        for (key, _) in &sorted {
511            self.bytes.extend_from_slice(key.as_bytes());
512        }
513
514        self.dict_offsets.insert(idx, off);
515        Ok(())
516    }
517}
518
519/// IR param signature that triggers [`EntryShape::Buffer`]. Mirrors
520/// `is_buffer_protocol_signature` on the cranelift side.
521pub(crate) fn is_buffer_protocol_signature(params: &[IrType], ret: IrType) -> bool {
522    matches!(
523        params,
524        [
525            IrType::I32,
526            IrType::I32,
527            IrType::I32,
528            IrType::I32,
529            IrType::I64
530        ]
531    ) && matches!(ret, IrType::I32)
532}
533
534/// Phase E.2 multi-function emit: lower every reachable IR function
535/// into LLVM. The entry function `entry` is emitted under either the
536/// legacy-i64 or buffer-protocol shape; each entry in `helpers` is
537/// emitted as a sibling helper function with a plain typed
538/// `(params...) -> ret` signature so the entry's `Op::Call` lowering
539/// can route to it through a direct LLVM `call` instruction.
540///
541/// `helper_ir_indices` parallels `helpers`: entry `i` carries the
542/// IR-side `funcs` index for the matching helper. Used by the
543/// `Op::Call` lowering to resolve `fn_index - stdlib_count` back to the
544/// matching `FunctionValue`.
545///
546/// Phase F.W7 widens the surface to closures-as-values:
547///
548/// - `lambdas` carries the IR funcs the lowering pass appended to the
549///   module's closure table (`#main`-side `fib: (k) => ...` lifts to a
550///   lambda Func). Each lambda is declared / emitted with the
551///   signature `(state, captures_ptr, ...lambda.params[1..]) -> ret`
552///   so the body's `LocalGet(0)` reads the captures_ptr arg, and so
553///   `Op::AllocScratch` / `*AtAbsolute` ops inside the body can reach
554///   the per-call arena state.
555/// - `closure_table` mirrors the IR's `Module::closure_table` so the
556///   emitter knows which `fn_table_idx` resolves to which lambda
557///   `FunctionValue`. Returned alongside `helper_table` so the
558///   `Op::MakeClosure` / `Op::CallClosure` lowering can refer to it.
559///
560/// `const_pool` ships the per-module ConstString blob the entry +
561/// helper bodies index into via `Op::ConstString { idx }`. The host
562/// copies `const_pool.bytes` to the arena prefix before every
563/// dispatch so the materialised `iconst(I32, offset)` resolves to a
564/// stable address.
565///
566/// Returns the entry `FunctionValue`, the detected entry shape, the
567/// helper lookup table the `Emit` driver hands off to the per-function
568/// lowering so sibling calls can find their callee, and the closure
569/// table (one entry per `fn_table_idx`, in source order).
570/// Open-world entry point (the only one MCJIT / `from_source` use).
571/// `Op::CallNative` lowers to the dynamic `relon_llvm_call_native`
572/// helper. Signature kept stable so the `evaluator.rs` call sites are
573/// untouched.
574#[allow(clippy::too_many_arguments, clippy::type_complexity)]
575pub(crate) fn emit_module_funcs<'ctx>(
576    ctx: &'ctx Context,
577    module: &LlvmModule<'ctx>,
578    entry: &Func,
579    buffer_return_size: u32,
580    const_pool: &ConstPool,
581    helpers: &[&Func],
582    helper_ir_indices: Option<&[u32]>,
583    lambdas: &[&Func],
584    closure_table: &[u32],
585    imports: &[relon_ir::ir::NativeImport],
586) -> Result<
587    (
588        FunctionValue<'ctx>,
589        EntryShape,
590        HashMap<u32, FunctionValue<'ctx>>,
591        Vec<FunctionValue<'ctx>>,
592    ),
593    LlvmError,
594> {
595    emit_module_funcs_impl(
596        ctx,
597        module,
598        entry,
599        buffer_return_size,
600        const_pool,
601        helpers,
602        helper_ir_indices,
603        lambdas,
604        closure_table,
605        imports,
606        WorldMode::OpenWorld,
607        crate::CodegenTarget::Native,
608        &[],
609    )
610}
611
612/// P3 §2.2 wasm32 entry point. Same open-world dispatch as
613/// [`emit_module_funcs`] but targets wasm32 so an `Op::CallNative`
614/// lowers to a **wasm import** call ([`crate::wasi_host`]) instead of the
615/// native `relon_llvm_call_native` MCJIT helper. Used only by the
616/// `emit_object_for_target(.., CodegenTarget::Wasm32)` object-emit path.
617#[allow(clippy::too_many_arguments, clippy::type_complexity)]
618pub(crate) fn emit_module_funcs_wasm<'ctx>(
619    ctx: &'ctx Context,
620    module: &LlvmModule<'ctx>,
621    entry: &Func,
622    buffer_return_size: u32,
623    const_pool: &ConstPool,
624    helpers: &[&Func],
625    helper_ir_indices: Option<&[u32]>,
626    lambdas: &[&Func],
627    closure_table: &[u32],
628    imports: &[relon_ir::ir::NativeImport],
629) -> Result<
630    (
631        FunctionValue<'ctx>,
632        EntryShape,
633        HashMap<u32, FunctionValue<'ctx>>,
634        Vec<FunctionValue<'ctx>>,
635    ),
636    LlvmError,
637> {
638    emit_module_funcs_impl(
639        ctx,
640        module,
641        entry,
642        buffer_return_size,
643        const_pool,
644        helpers,
645        helper_ir_indices,
646        lambdas,
647        closure_table,
648        imports,
649        WorldMode::OpenWorld,
650        crate::CodegenTarget::Wasm32,
651        &[],
652    )
653}
654
655/// Stage 1.B closed-world entry point. `Op::CallNative` lowers to a
656/// static `call @<host_symbol>` against an `extern` declaration; the
657/// host bitcode is linked in + inlined by [`crate::cocompile`]. Used
658/// only by the co-compile orchestration — never by MCJIT / `from_source`.
659#[allow(clippy::too_many_arguments, clippy::type_complexity)]
660pub(crate) fn emit_module_funcs_closed_world<'ctx>(
661    ctx: &'ctx Context,
662    module: &LlvmModule<'ctx>,
663    entry: &Func,
664    buffer_return_size: u32,
665    const_pool: &ConstPool,
666    helpers: &[&Func],
667    helper_ir_indices: Option<&[u32]>,
668    lambdas: &[&Func],
669    closure_table: &[u32],
670    imports: &[relon_ir::ir::NativeImport],
671) -> Result<
672    (
673        FunctionValue<'ctx>,
674        EntryShape,
675        HashMap<u32, FunctionValue<'ctx>>,
676        Vec<FunctionValue<'ctx>>,
677    ),
678    LlvmError,
679> {
680    emit_module_funcs_impl(
681        ctx,
682        module,
683        entry,
684        buffer_return_size,
685        const_pool,
686        helpers,
687        helper_ir_indices,
688        lambdas,
689        closure_table,
690        imports,
691        WorldMode::ClosedWorld,
692        crate::CodegenTarget::Native,
693        &[],
694    )
695}
696
697/// P3 §2.2 wasm closed-world co-compile entry point. Like
698/// [`emit_module_funcs_closed_world`] but targets **wasm32**: a
699/// pure-compute `Op::CallNative` (an import whose `effectful_imports`
700/// flag is `false`) lowers to a direct `call @<host_symbol>` that the
701/// wasm host-shim co-compile ([`crate::cocompile::link_and_inline_host_shim_wasm`])
702/// links + inlines into the wasm unit, mirroring the native closed-world
703/// inline. An **effectful** import (flag `true` — capability-gated by a
704/// preceding `Op::CheckCap`) instead routes to a **wasm import** call
705/// ([`crate::wasi_host`]) so the side effect crosses the sandbox boundary
706/// back out to the trusted host (ADR §2.2: pure inline, effectful → WASI).
707///
708/// `effectful_imports[i]` is the per-`import_idx` effectful flag; the
709/// caller (`emit_object_for_target`) derives it from the IR's
710/// CheckCap → CallNative shape.
711#[allow(clippy::too_many_arguments, clippy::type_complexity)]
712pub(crate) fn emit_module_funcs_closed_world_wasm<'ctx>(
713    ctx: &'ctx Context,
714    module: &LlvmModule<'ctx>,
715    entry: &Func,
716    buffer_return_size: u32,
717    const_pool: &ConstPool,
718    helpers: &[&Func],
719    helper_ir_indices: Option<&[u32]>,
720    lambdas: &[&Func],
721    closure_table: &[u32],
722    imports: &[relon_ir::ir::NativeImport],
723    effectful_imports: &[bool],
724) -> Result<
725    (
726        FunctionValue<'ctx>,
727        EntryShape,
728        HashMap<u32, FunctionValue<'ctx>>,
729        Vec<FunctionValue<'ctx>>,
730    ),
731    LlvmError,
732> {
733    emit_module_funcs_impl(
734        ctx,
735        module,
736        entry,
737        buffer_return_size,
738        const_pool,
739        helpers,
740        helper_ir_indices,
741        lambdas,
742        closure_table,
743        imports,
744        WorldMode::ClosedWorld,
745        crate::CodegenTarget::Wasm32,
746        effectful_imports,
747    )
748}
749
750#[allow(clippy::too_many_arguments, clippy::type_complexity)]
751fn emit_module_funcs_impl<'ctx>(
752    ctx: &'ctx Context,
753    module: &LlvmModule<'ctx>,
754    entry: &Func,
755    buffer_return_size: u32,
756    const_pool: &ConstPool,
757    helpers: &[&Func],
758    helper_ir_indices: Option<&[u32]>,
759    lambdas: &[&Func],
760    closure_table: &[u32],
761    imports: &[relon_ir::ir::NativeImport],
762    world_mode: WorldMode,
763    target: crate::CodegenTarget,
764    effectful_imports: &[bool],
765) -> Result<
766    (
767        FunctionValue<'ctx>,
768        EntryShape,
769        HashMap<u32, FunctionValue<'ctx>>,
770        Vec<FunctionValue<'ctx>>,
771    ),
772    LlvmError,
773> {
774    // Step 0: declare module-level intrinsics. `llvm.trap` is shared
775    // by every Div / Mod sandbox guard so a single declaration covers
776    // every per-op guard across every emitted function.
777    declare_llvm_trap(ctx, module);
778
779    // Step 1: declare every helper up-front so the entry / sibling
780    // bodies can resolve forward references (mutual recursion, the
781    // `fib(n - 1) + fib(n - 2)` self-call). LLVM is happy to issue
782    // `call @foo` against a declared-only function; the body is
783    // attached on the second pass.
784    let mut helper_table: HashMap<u32, FunctionValue<'ctx>> = HashMap::new();
785    if let Some(ir_indices) = helper_ir_indices {
786        if ir_indices.len() != helpers.len() {
787            return Err(LlvmError::Codegen(format!(
788                "emit_module_funcs: helpers.len()={} but helper_ir_indices.len()={}",
789                helpers.len(),
790                ir_indices.len()
791            )));
792        }
793    }
794    for (i, helper) in helpers.iter().enumerate() {
795        let fv = declare_helper_function(ctx, module, helper, i)?;
796        let ir_idx = helper_ir_indices.map(|v| v[i]).unwrap_or(i as u32);
797        helper_table.insert(ir_idx, fv);
798    }
799
800    // Phase F.W7: declare every lambda function up-front. Lambdas use
801    // a widened signature `(state, ...lambda.params) -> ret` — the
802    // first IR param (already `IrType::I32`, the captures_ptr the IR
803    // lowering pass prepended in `lower_closure_as_value`) becomes
804    // LLVM param 1 (just past the implicit `*state`). Subsequent
805    // user params shift to LLVM param indices 2.. so the body's
806    // `LocalGet(idx)` resolves to LLVM param `idx + 1`
807    // (`param_base = 1`).
808    let mut closure_fn_table: Vec<FunctionValue<'ctx>> = Vec::with_capacity(closure_table.len());
809    if lambdas.len() != closure_table.len() {
810        return Err(LlvmError::Codegen(format!(
811            "emit_module_funcs: lambdas.len()={} but closure_table.len()={}",
812            lambdas.len(),
813            closure_table.len()
814        )));
815    }
816    for (slot, lambda) in lambdas.iter().enumerate() {
817        let fv = declare_lambda_function(ctx, module, lambda, slot)?;
818        closure_fn_table.push(fv);
819    }
820
821    // Step 2: emit the entry function body.
822    let (entry_fn, shape) = if is_buffer_protocol_signature(&entry.params, entry.ret) {
823        let fv = emit_buffer_entry_with_helpers_and_closures(
824            ctx,
825            module,
826            entry,
827            buffer_return_size,
828            const_pool,
829            &helper_table,
830            &closure_fn_table,
831            imports,
832            world_mode,
833            target,
834            effectful_imports,
835        )?;
836        (fv, EntryShape::Buffer)
837    } else {
838        // The legacy-i64 entry shape covers hand-built fixtures only; it
839        // never references ConstString and supplies its own empty pool
840        // inside `emit_legacy_entry_impl`.
841        let fv =
842            emit_legacy_entry_with_helpers(ctx, module, entry, &helper_table, imports, world_mode)?;
843        (fv, EntryShape::LegacyI64)
844    };
845
846    // Step 3: emit each helper body now that every callee is declared.
847    for helper in helpers.iter() {
848        let helper_fn = helper_table
849            .values()
850            .find(|fv| {
851                // Locate the FunctionValue by name; cheap enough — the
852                // helper table is tiny and the find runs once per
853                // helper.
854                let expected = format!("relon_helper_{}", helper.name);
855                fv.get_name().to_string_lossy() == expected
856            })
857            .copied()
858            .ok_or_else(|| {
859                LlvmError::Codegen(format!(
860                    "emit_module_funcs: helper `{}` declared but FunctionValue missing",
861                    helper.name
862                ))
863            })?;
864        emit_helper_body(ctx, module, helper, helper_fn, const_pool, &helper_table)?;
865    }
866
867    // Step 4 (Phase F.W7): emit each lambda body. Lambdas share the
868    // `helper_table` so the body can route an inner `Op::Call` to a
869    // sibling helper (Phase E.2 cross-call). They also share the
870    // `closure_fn_table` so a nested `Op::MakeClosure` resolves the
871    // matching lambda FunctionValue from its `fn_table_idx`.
872    //
873    // Build the module-wide self-capture table once before emitting
874    // lambda bodies. The table maps each lambda's `fn_table_idx` to
875    // the captures-struct offsets that hold self-recursive handles
876    // (i.e. handles whose `captures_ptr` field equals the lambda's
877    // own captures_ptr arg). The lambda-body emit uses this table to
878    // stamp [`Provenance::OwnCaptureHandle`] on the matching capture
879    // loads so the recursive call site can pick the direct-call fast
880    // path. Empty for modules that have no self-recursive closures.
881    let self_capture_table = build_self_capture_table(entry, helpers, lambdas);
882    // Devirtualisation (W18): companion table for captures of known
883    // (non-self) closures — lets the W18 predicate's `is_prime` call
884    // devirtualise inside the predicate lambda body.
885    let known_capture_table = build_known_capture_table(entry, helpers, lambdas);
886    for (slot, lambda) in lambdas.iter().enumerate() {
887        let lambda_fn = closure_fn_table[slot];
888        let slot_u32 = slot as u32;
889        let offsets = self_capture_table
890            .get(&slot_u32)
891            .cloned()
892            .unwrap_or_default();
893        let known_offsets = known_capture_table
894            .get(&slot_u32)
895            .cloned()
896            .unwrap_or_default();
897        emit_lambda_body(
898            ctx,
899            module,
900            lambda,
901            lambda_fn,
902            const_pool,
903            &helper_table,
904            &closure_fn_table,
905            &offsets,
906            &known_offsets,
907        )?;
908    }
909
910    Ok((entry_fn, shape, helper_table, closure_fn_table))
911}
912
913/// Phase F.W7 self-recursion fast path: scan every IR function body
914/// (entry + helpers + lambdas) for the canonical
915/// `Op::MakeClosure { fn_table_idx, captures } ; Op::LetSet { idx, ty:
916/// Closure }` pair and collect the captures whose `let_idx` matches the
917/// `LetSet`'s `idx` — those are the self-recursive captures stamped by
918/// `lower_closure_as_value`'s "let-slot not yet bound" branch.
919///
920/// Returns `fn_table_idx -> [(capture_offset, self_fn_table_idx)]` so
921/// the lambda body emitter can stamp the matching
922/// [`Provenance::OwnCaptureHandle`] on each capture load.
923///
924/// The scan tolerates intervening ops between `MakeClosure` and
925/// `LetSet` (none are emitted today; future lowering passes that
926/// interleave additional setup ops would still be matched). It bails
927/// silently on patterns it can't recognise — the fast path stays
928/// opt-in and the slow-path `emit_call_closure` keeps working
929/// regardless.
930fn build_self_capture_table(
931    entry: &Func,
932    helpers: &[&Func],
933    lambdas: &[&Func],
934) -> HashMap<u32, Vec<(u32, u32)>> {
935    let mut table: HashMap<u32, Vec<(u32, u32)>> = HashMap::new();
936
937    let scan = |func: &Func, table: &mut HashMap<u32, Vec<(u32, u32)>>| {
938        let ops = &func.body;
939        for (i, tagged) in ops.iter().enumerate() {
940            // Find a MakeClosure immediately followed by a matching
941            // `LetSet { ty: Closure }`. The IR lowering pass emits
942            // these adjacently (see `lower_anon_dict_body` /
943            // `lower_closure_as_value`); intervening ops break the
944            // simple match and the slow-path dispatch keeps working.
945            let Op::MakeClosure {
946                fn_table_idx,
947                ref captures,
948                ..
949            } = tagged.op
950            else {
951                continue;
952            };
953            let Some(next) = ops.get(i + 1) else {
954                continue;
955            };
956            let Op::LetSet {
957                idx,
958                ty: relon_ir::ir::IrType::Closure,
959            } = next.op
960            else {
961                continue;
962            };
963            for cap in captures {
964                if cap.let_idx == idx && matches!(cap.ty, relon_ir::ir::IrType::Closure) {
965                    table
966                        .entry(fn_table_idx)
967                        .or_default()
968                        .push((cap.offset, fn_table_idx));
969                }
970            }
971        }
972    };
973
974    scan(entry, &mut table);
975    for h in helpers {
976        scan(h, &mut table);
977    }
978    for l in lambdas {
979        scan(l, &mut table);
980    }
981    table
982}
983
984/// Devirtualisation (W18, 2026-05-30): companion to
985/// [`build_self_capture_table`] for *non-self* captures of a closure
986/// whose `fn_table_idx` is a compile-time constant.
987///
988/// Maps each lambda's `fn_table_idx` to the captures-struct offsets that
989/// hold a handle produced by a literal `Op::MakeClosure { K }` (a
990/// *known* closure), together with that `K`. The lambda-body emit uses
991/// this to stamp [`Provenance::KnownClosure`] on the matching capture
992/// load (the prologue `LocalGet(0); LoadI32AtAbsolute { offset };
993/// LetSet { Closure }`), so a `CallClosure` against the capture (e.g.
994/// the W18 predicate's `is_prime(k, 2)` call) emits a direct call
995/// instead of the runtime `switch i32 %cc_fn_idx`.
996///
997/// Soundness: within each function we track, in source order, the
998/// most-recent `MakeClosure { K }; LetSet { idx, Closure }` assignment
999/// per outer let-slot. Any *other* `LetSet { idx, Closure }` clears the
1000/// slot — so a let that is reassigned to a dynamically-chosen closure is
1001/// never recorded as known. A capture is recorded only when its
1002/// `let_idx` resolves to a still-known slot AND the captured `K` differs
1003/// from the capturing lambda `L` (a self-capture, `K == L`, is owned by
1004/// [`build_self_capture_table`], whose `captures_ptr`-reuse fast path is
1005/// strictly better). The lowering pass emits the capturing
1006/// `MakeClosure` only after the captured let is bound and reads the live
1007/// slot, so the tracked `K` is exactly the value the capture holds.
1008fn build_known_capture_table(
1009    entry: &Func,
1010    helpers: &[&Func],
1011    lambdas: &[&Func],
1012) -> HashMap<u32, Vec<(u32, u32)>> {
1013    use relon_ir::ir::IrType as Irt;
1014    let mut table: HashMap<u32, Vec<(u32, u32)>> = HashMap::new();
1015
1016    let scan = |func: &Func, table: &mut HashMap<u32, Vec<(u32, u32)>>| {
1017        let ops = &func.body;
1018        // outer let-slot -> known captured `fn_table_idx`, last-write
1019        // wins; cleared when the slot is reassigned a non-known closure.
1020        let mut known_slots: HashMap<u32, u32> = HashMap::new();
1021        for (i, tagged) in ops.iter().enumerate() {
1022            // Maintain `known_slots` off each `LetSet { idx, Closure }`:
1023            // if the immediately-preceding op is a `MakeClosure { K }`
1024            // (the canonical `MakeClosure; LetSet` binding the lowering
1025            // emits) the slot becomes a *known* closure `K`; any other
1026            // `LetSet { Closure }` stores a value we cannot prove is one
1027            // statically-known closure, so the slot is dropped. Driving
1028            // this off the `LetSet` (rather than the `MakeClosure`)
1029            // avoids the binding `LetSet` clobbering the very entry the
1030            // preceding `MakeClosure` established.
1031            if let Op::LetSet {
1032                idx,
1033                ty: Irt::Closure,
1034            } = tagged.op
1035            {
1036                if let Some(Op::MakeClosure { fn_table_idx, .. }) =
1037                    i.checked_sub(1).and_then(|p| ops.get(p)).map(|t| &t.op)
1038                {
1039                    known_slots.insert(idx, *fn_table_idx);
1040                } else {
1041                    known_slots.remove(&idx);
1042                }
1043                continue;
1044            }
1045            // At a capturing `MakeClosure { L }`, record each capture
1046            // that reads a still-known slot. The capturing closure's own
1047            // handle need NOT be stored to a let — the W18 predicate is
1048            // passed straight into `_list_filter` — because the fact
1049            // recorded here is about lambda `L`'s captures-struct layout
1050            // (offset O holds known closure K), which is fixed by `L`'s
1051            // own `MakeClosure` captures and the known-ness of the
1052            // captured outer let, independent of where `L`'s handle goes.
1053            if let Op::MakeClosure {
1054                fn_table_idx: l_idx,
1055                ref captures,
1056                ..
1057            } = tagged.op
1058            {
1059                for cap in captures {
1060                    if !matches!(cap.ty, Irt::Closure) {
1061                        continue;
1062                    }
1063                    if let Some(&k_idx) = known_slots.get(&cap.let_idx) {
1064                        // `k_idx == l_idx` is a self-capture — owned by
1065                        // `build_self_capture_table`; skip here.
1066                        if k_idx != l_idx {
1067                            table.entry(l_idx).or_default().push((cap.offset, k_idx));
1068                        }
1069                    }
1070                }
1071            }
1072        }
1073    };
1074
1075    scan(entry, &mut table);
1076    for h in helpers {
1077        scan(h, &mut table);
1078    }
1079    for l in lambdas {
1080        scan(l, &mut table);
1081    }
1082    table
1083}
1084
1085/// Devirtualisation (W18) correctness helper: collect every let-slot
1086/// index that a body assigns via `Op::LetSet { ty: Closure }`, recursing
1087/// into nested `Op::If` / `Op::Block` / `Op::Loop` bodies. Used by
1088/// `emit_loop` to conservatively invalidate the `KnownClosure` let-slot
1089/// tracker for any closure slot the loop body reassigns, so a
1090/// cross-iteration read cannot devirtualise to a stale target.
1091fn collect_closure_letset_slots(body: &[TaggedOp], out: &mut Vec<u32>) {
1092    for t in body {
1093        match &t.op {
1094            Op::LetSet {
1095                idx,
1096                ty: relon_ir::ir::IrType::Closure,
1097            } => out.push(*idx),
1098            Op::If {
1099                then_body,
1100                else_body,
1101                ..
1102            } => {
1103                collect_closure_letset_slots(then_body, out);
1104                collect_closure_letset_slots(else_body, out);
1105            }
1106            Op::Block { body, .. } | Op::Loop { body, .. } => {
1107                collect_closure_letset_slots(body, out);
1108            }
1109            _ => {}
1110        }
1111    }
1112}
1113
1114/// Declare a sibling helper function's LLVM signature without emitting
1115/// its body. Used to seat every helper into the module so the entry's
1116/// `Op::Call` lowering can resolve forward references (recursion,
1117/// mutual recursion). Sibling helpers use a plain typed
1118/// `(params...) -> ret` shape — no `*state` pointer, no buffer
1119/// protocol; the test harness drives recursive Int-only functions
1120/// directly. When the IR layer grows first-class closure values
1121/// (Phase F), this signature widens to carry `(*state, captures, ...)`.
1122fn declare_helper_function<'ctx>(
1123    ctx: &'ctx Context,
1124    module: &LlvmModule<'ctx>,
1125    func: &Func,
1126    slot: usize,
1127) -> Result<FunctionValue<'ctx>, LlvmError> {
1128    let mut param_types: Vec<BasicMetadataTypeEnum<'ctx>> = Vec::with_capacity(func.params.len());
1129    for (i, p) in func.params.iter().enumerate() {
1130        let bt = ir_ty_to_llvm_abi(ctx, *p).ok_or_else(|| {
1131            LlvmError::UnsupportedSignature(format!(
1132                "llvm-aot: helper `{}` param #{i} type {p:?} unsupported",
1133                func.name
1134            ))
1135        })?;
1136        param_types.push(basic_to_metadata(bt));
1137    }
1138    let ret_bt = ir_ty_to_llvm_abi(ctx, func.ret).ok_or_else(|| {
1139        LlvmError::UnsupportedSignature(format!(
1140            "llvm-aot: helper `{}` return type {:?} unsupported",
1141            func.name, func.ret
1142        ))
1143    })?;
1144    let fn_type = match ret_bt {
1145        BasicTypeEnum::IntType(t) => t.fn_type(&param_types, false),
1146        BasicTypeEnum::FloatType(t) => t.fn_type(&param_types, false),
1147        BasicTypeEnum::PointerType(t) => t.fn_type(&param_types, false),
1148        other => {
1149            return Err(LlvmError::Codegen(format!(
1150                "llvm-aot: helper `{}` ret BasicType {other:?} unsupported",
1151                func.name
1152            )));
1153        }
1154    };
1155    // Use a deterministic LLVM symbol so the entry's call site can be
1156    // pretty-printed in the IR dump. The slot keeps multiple helpers
1157    // with the same source name (shouldn't happen, but cheap) from
1158    // colliding.
1159    let _ = slot;
1160    let llvm_name = format!("relon_helper_{}", func.name);
1161    let fv = module.add_function(&llvm_name, fn_type, Some(Linkage::Internal));
1162    Ok(fv)
1163}
1164
1165/// Phase F.W7: declare a lambda function's LLVM signature without
1166/// emitting its body. Lambdas always carry the
1167/// `(state: ptr, ...lambda.params) -> ret` signature — the first IR
1168/// param is the captures_ptr the IR lowering pass prepended in
1169/// `lower_closure_as_value`, surfaced through LLVM param 1. Subsequent
1170/// LLVM params correspond to the lambda's user-visible args.
1171///
1172/// The implicit `*state` pointer at LLVM param 0 mirrors the
1173/// buffer-protocol entry's leading state slot so the lambda body's
1174/// `Op::AllocScratch{,Dyn}` / `Op::*AtAbsolute` ops can resolve
1175/// `arena_base` + scratch cursors through the same helper paths the
1176/// entry uses.
1177fn declare_lambda_function<'ctx>(
1178    ctx: &'ctx Context,
1179    module: &LlvmModule<'ctx>,
1180    func: &Func,
1181    slot: usize,
1182) -> Result<FunctionValue<'ctx>, LlvmError> {
1183    let ptr_t = ctx.ptr_type(AddressSpace::default());
1184    let mut param_types: Vec<BasicMetadataTypeEnum<'ctx>> =
1185        Vec::with_capacity(1 + func.params.len());
1186    param_types.push(ptr_t.into());
1187    for (i, p) in func.params.iter().enumerate() {
1188        let bt = ir_ty_to_llvm_abi(ctx, *p).ok_or_else(|| {
1189            LlvmError::UnsupportedSignature(format!(
1190                "llvm-aot: lambda `{}` param #{i} type {p:?} unsupported",
1191                func.name
1192            ))
1193        })?;
1194        param_types.push(basic_to_metadata(bt));
1195    }
1196    let ret_bt = ir_ty_to_llvm_abi(ctx, func.ret).ok_or_else(|| {
1197        LlvmError::UnsupportedSignature(format!(
1198            "llvm-aot: lambda `{}` return type {:?} unsupported",
1199            func.name, func.ret
1200        ))
1201    })?;
1202    let fn_type = match ret_bt {
1203        BasicTypeEnum::IntType(t) => t.fn_type(&param_types, false),
1204        BasicTypeEnum::FloatType(t) => t.fn_type(&param_types, false),
1205        BasicTypeEnum::PointerType(t) => t.fn_type(&param_types, false),
1206        other => {
1207            return Err(LlvmError::Codegen(format!(
1208                "llvm-aot: lambda `{}` ret BasicType {other:?} unsupported",
1209                func.name
1210            )));
1211        }
1212    };
1213    // `relon_lambda_<slot>_<name>` so the emitted IR dump is greppable
1214    // when debugging which `fn_table_idx` mapped to which body.
1215    let llvm_name = format!("relon_lambda_{}_{}", slot, func.name);
1216    let fv = module.add_function(&llvm_name, fn_type, Some(Linkage::Internal));
1217    Ok(fv)
1218}
1219
1220/// Phase E.2: declare the `llvm.trap` intrinsic on `module` if it is
1221/// not already present. The intrinsic has signature `void @llvm.trap()`
1222/// — calling it raises a target-specific trap (a `ud2` on x86-64) that
1223/// the host's `panic` handler can catch when paired with an
1224/// `unreachable`. Cheap to call on every emit pass; we keep the lookup
1225/// idempotent so test fixtures that re-enter the emitter don't end up
1226/// with duplicate declarations.
1227fn declare_llvm_trap<'ctx>(ctx: &'ctx Context, module: &LlvmModule<'ctx>) -> FunctionValue<'ctx> {
1228    if let Some(f) = module.get_function("llvm.trap") {
1229        return f;
1230    }
1231    let void_t = ctx.void_type();
1232    let fn_ty = void_t.fn_type(&[], false);
1233    module.add_function("llvm.trap", fn_ty, None)
1234}
1235
1236/// Phase 0b: declare the `relon_llvm_call_native` host-dispatch helper
1237/// on `module` if absent. Signature mirrors the Rust helper:
1238///
1239/// ```text
1240/// i64 relon_llvm_call_native(ptr state, i32 import_idx,
1241///                            ptr args_ptr, i32 arg_count)
1242/// ```
1243///
1244/// `Linkage::External` so MCJIT resolves it to the host address the
1245/// evaluator registers via `add_global_mapping` (the default resolver
1246/// can't see the static from inside the host dylib's section layout —
1247/// same constraint as the `str.contains` shim). Idempotent so repeated
1248/// emit passes don't duplicate the declaration.
1249fn declare_call_native<'ctx>(ctx: &'ctx Context, module: &LlvmModule<'ctx>) -> FunctionValue<'ctx> {
1250    if let Some(f) = module.get_function(crate::state::RELON_LLVM_CALL_NATIVE_SYMBOL) {
1251        return f;
1252    }
1253    let i64_t = ctx.i64_type();
1254    let i32_t = ctx.i32_type();
1255    let ptr_t = ctx.ptr_type(AddressSpace::default());
1256    let fn_ty = i64_t.fn_type(
1257        &[ptr_t.into(), i32_t.into(), ptr_t.into(), i32_t.into()],
1258        false,
1259    );
1260    module.add_function(
1261        crate::state::RELON_LLVM_CALL_NATIVE_SYMBOL,
1262        fn_ty,
1263        Some(Linkage::External),
1264    )
1265}
1266
1267/// Stage 1.B closed-world: declare a host `#native` fn as an external
1268/// `(i64...) -> i64` so `Op::CallNative` can emit a direct
1269/// `call @<host_symbol>`. Every scalar arg / return rides the i64 lane
1270/// (Bool / I32 zero-extend in; Unit returns `void`), matching the host
1271/// shim's `#[no_mangle] extern "C" fn(i64...) -> i64` ABI the
1272/// co-compile step links in. Idempotent: a repeated import name reuses
1273/// the existing declaration.
1274///
1275/// The lane is deliberately the same i64 width the open-world helper
1276/// decodes, so the two paths are bit-for-bit differential-comparable.
1277fn declare_host_fn_direct<'ctx>(
1278    ctx: &'ctx Context,
1279    module: &LlvmModule<'ctx>,
1280    import: &relon_ir::ir::NativeImport,
1281) -> FunctionValue<'ctx> {
1282    if let Some(f) = module.get_function(&import.name) {
1283        return f;
1284    }
1285    let i64_t = ctx.i64_type();
1286    let params: Vec<BasicMetadataTypeEnum<'ctx>> =
1287        import.param_tys.iter().map(|_| i64_t.into()).collect();
1288    let fn_ty = match import.ret_ty {
1289        IrType::Unit => ctx.void_type().fn_type(&params, false),
1290        _ => i64_t.fn_type(&params, false),
1291    };
1292    module.add_function(&import.name, fn_ty, Some(Linkage::External))
1293}
1294
1295/// #359 (W20): map an [`IrType`] to the LLVM type used in a helper /
1296/// lambda **call ABI** slot. This mirrors the operand-stack
1297/// convention where `F64` rides as its 64-bit *bit pattern* in an i64
1298/// register: `F64` maps to `i64`, not `double`. Keeping the ABI int-
1299/// only means a `CallClosure` / `Op::Call` site never has to bitcast
1300/// between the i64-bits stack representation and a native-float
1301/// argument / return slot — the value flows through verbatim. The
1302/// W20 n-body helpers (`pair_force` / `accel` return `F64`,
1303/// `pair_force` takes an `F64` mass) are the first closures with a
1304/// Float in their signature; without this they'd declare a `double`
1305/// slot that the i64-bits operand stack cannot feed.
1306fn ir_ty_to_llvm_abi<'ctx>(ctx: &'ctx Context, ty: IrType) -> Option<BasicTypeEnum<'ctx>> {
1307    match ty {
1308        IrType::I64 | IrType::F64 => Some(ctx.i64_type().into()),
1309        IrType::I32 | IrType::Bool | IrType::Unit => Some(ctx.i32_type().into()),
1310        IrType::String
1311        | IrType::ListInt
1312        | IrType::ListFloat
1313        | IrType::ListBool
1314        | IrType::ListString
1315        | IrType::ListSchema
1316        | IrType::ListList
1317        | IrType::Closure
1318        | IrType::Dict => Some(ctx.i32_type().into()),
1319    }
1320}
1321
1322fn basic_to_metadata(bt: BasicTypeEnum<'_>) -> BasicMetadataTypeEnum<'_> {
1323    match bt {
1324        BasicTypeEnum::IntType(t) => t.into(),
1325        BasicTypeEnum::FloatType(t) => t.into(),
1326        BasicTypeEnum::PointerType(t) => t.into(),
1327        BasicTypeEnum::ArrayType(t) => t.into(),
1328        BasicTypeEnum::StructType(t) => t.into(),
1329        BasicTypeEnum::VectorType(t) => t.into(),
1330        BasicTypeEnum::ScalableVectorType(t) => t.into(),
1331    }
1332}
1333
1334/// Lower a sibling helper's body against its declared LLVM
1335/// `FunctionValue`. Mirrors [`emit_legacy_entry`] but without enforcing
1336/// the legacy-i64 envelope — helpers may carry any
1337/// [`IrType`]-shaped param / return mix that `ir_ty_to_llvm_abi`
1338/// accepts.
1339fn emit_helper_body<'ctx>(
1340    ctx: &'ctx Context,
1341    module: &LlvmModule<'ctx>,
1342    func: &Func,
1343    llvm_fn: FunctionValue<'ctx>,
1344    const_pool: &ConstPool,
1345    helper_table: &HashMap<u32, FunctionValue<'ctx>>,
1346) -> Result<(), LlvmError> {
1347    let entry_bb = ctx.append_basic_block(llvm_fn, "entry");
1348    let builder = ctx.create_builder();
1349    builder.position_at_end(entry_bb);
1350
1351    let mut emit = Emit::new(
1352        ctx,
1353        &builder,
1354        module,
1355        llvm_fn,
1356        EntryShape::LegacyI64,
1357        /*arena_base_ptr=*/ None,
1358        /*state_ptr=*/ None,
1359        /*buffer_return_size=*/ 0,
1360        const_pool,
1361    );
1362    // Helper functions have no implicit state slot; `LocalGet(0)` maps
1363    // straight to LLVM param 0.
1364    emit.param_base = 0;
1365    emit.helper_table = Some(helper_table.clone());
1366    // Record the IR-declared return type so `Op::Return` knows what to
1367    // widen / truncate to when the operand stack value's width differs
1368    // from the LLVM signature's return slot.
1369    emit.helper_ret_ty = Some(func.ret);
1370    emit.llvm_trap_fn = Some(declare_llvm_trap(ctx, module));
1371    emit.let_floor = relon_ir::ir::body_let_watermark(&func.body);
1372    emit.lower_body(&func.body)?;
1373    Ok(())
1374}
1375
1376/// Phase F.W7: emit a lambda body. Mirrors [`emit_helper_body`] but:
1377///
1378/// - The first LLVM param (`*state`) is materialised into
1379///   `arena_base_ptr` + `state_ptr` so the body's
1380///   `Op::AllocScratch{,Dyn}` / `Op::*AtAbsolute` ops resolve against
1381///   the per-call arena state. Required because lambdas read captures
1382///   via `LocalGet(0); LoadI32AtAbsolute { offset }` against the
1383///   captures struct in scratch.
1384/// - `param_base = 1` so the IR's `LocalGet(idx)` skips the implicit
1385///   state slot — `LocalGet(0)` therefore reads the captures_ptr at
1386///   LLVM param 1, matching what the IR lowering pass laid out in
1387///   `lower_closure_as_value`.
1388/// - The closure table is threaded through so nested
1389///   `Op::MakeClosure` / `Op::CallClosure` ops inside the lambda body
1390///   keep resolving against the same module-wide lambda set the entry
1391///   uses.
1392#[allow(clippy::too_many_arguments)]
1393fn emit_lambda_body<'ctx>(
1394    ctx: &'ctx Context,
1395    module: &LlvmModule<'ctx>,
1396    func: &Func,
1397    llvm_fn: FunctionValue<'ctx>,
1398    const_pool: &ConstPool,
1399    helper_table: &HashMap<u32, FunctionValue<'ctx>>,
1400    closure_fn_table: &[FunctionValue<'ctx>],
1401    self_capture_offsets: &[(u32, u32)],
1402    known_capture_offsets: &[(u32, u32)],
1403) -> Result<(), LlvmError> {
1404    let entry_bb = ctx.append_basic_block(llvm_fn, "entry");
1405    let builder = ctx.create_builder();
1406    builder.position_at_end(entry_bb);
1407
1408    // Materialise `state_ptr` + `arena_base_ptr` at function entry.
1409    // Same pointer-arithmetic shape the buffer entry uses — the lambda
1410    // shares the per-call `ArenaState` layout because the host (the
1411    // entry function or another lambda) passes its own state pointer
1412    // through to the call indirect site verbatim.
1413    let i32_t = ctx.i32_type();
1414    let i64_t = ctx.i64_type();
1415    let i8_t = ctx.i8_type();
1416    let ptr_t = ctx.ptr_type(AddressSpace::default());
1417    let state_param = llvm_fn
1418        .get_nth_param(0)
1419        .ok_or_else(|| LlvmError::Codegen(format!("lambda `{}` missing state param", func.name)))?
1420        .into_pointer_value();
1421    let arena_base_gep = unsafe {
1422        builder
1423            .build_in_bounds_gep(
1424                i8_t,
1425                state_param,
1426                &[i32_t.const_int(ARENA_STATE_OFFSET_BASE as u64, false)],
1427                "lambda_arena_base_gep",
1428            )
1429            .map_err(|e| LlvmError::Codegen(format!("lambda arena_base GEP: {e}")))?
1430    };
1431    // TODO(P3-wasm32): use DataLayout pointer width instead of i64
1432    // for the arena-base word load + inttoptr below.
1433    let arena_base_load = builder
1434        .build_load(i64_t, arena_base_gep, "lambda_arena_base")
1435        .map_err(|e| LlvmError::Codegen(format!("lambda arena_base load: {e}")))?;
1436    mark_invariant_load(ctx, arena_base_load);
1437    let arena_base_int = arena_base_load.into_int_value();
1438    let arena_base_ptr = builder
1439        .build_int_to_ptr(arena_base_int, ptr_t, "lambda_arena_base_ptr")
1440        .map_err(|e| LlvmError::Codegen(format!("lambda arena_base inttoptr: {e}")))?;
1441
1442    // Stash the captures_ptr LLVM param (param 1) so the self-recursion
1443    // fast path in `emit_call_closure` can reuse it directly instead
1444    // of round-tripping through a `captures_ptr` field load on every
1445    // recursion. The lambda signature pins this to LLVM param 1 (param
1446    // 0 is `*state`) — see `declare_lambda_function`.
1447    let captures_ptr_param = llvm_fn
1448        .get_nth_param(1)
1449        .ok_or_else(|| {
1450            LlvmError::Codegen(format!("lambda `{}` missing captures_ptr param", func.name))
1451        })?
1452        .into_int_value();
1453
1454    let mut emit = Emit::new(
1455        ctx,
1456        &builder,
1457        module,
1458        llvm_fn,
1459        EntryShape::LegacyI64,
1460        Some(arena_base_ptr),
1461        Some(state_param),
1462        /*buffer_return_size=*/ 0,
1463        const_pool,
1464    );
1465    // LLVM param 0 is `*state`; the IR's params (including the
1466    // implicit captures_ptr at IR index 0) start at LLVM param 1.
1467    emit.param_base = 1;
1468    emit.helper_table = Some(helper_table.clone());
1469    emit.closure_fn_table = closure_fn_table.to_vec();
1470    // The lambda body's `Op::Return` carries the IR-declared return
1471    // type so the dispatcher knows what LLVM `ret` shape to emit.
1472    emit.helper_ret_ty = Some(func.ret);
1473    emit.llvm_trap_fn = Some(declare_llvm_trap(ctx, module));
1474    emit.self_capture_offsets = self_capture_offsets.to_vec();
1475    emit.known_capture_offsets = known_capture_offsets.to_vec();
1476    emit.captures_ptr_param = Some(captures_ptr_param);
1477    emit.let_floor = relon_ir::ir::body_let_watermark(&func.body);
1478    emit.lower_body(&func.body)?;
1479    Ok(())
1480}
1481
1482/// Phase D.1: emit a typed `(i64, i64, ...) -> i64` fast entry
1483/// alongside the buffer-protocol entry. Reuses the IR body's op
1484/// stream but rewrites every buffer-protocol `LoadField` into a
1485/// direct LLVM param read (via `profile.arg_offsets`) and every
1486/// trailing `StoreField` at the return-value offset into a `ret`
1487/// against the stashed value.
1488///
1489/// Returns `Err` when the IR contains ops outside the fast-path
1490/// envelope (string ops, sandbox traps, pointer-indirect StoreField,
1491/// stdlib calls — anything that escapes the simple Int-arithmetic
1492/// loop). The evaluator side surfaces this as "fast path unavailable;
1493/// fall back to the buffer entry" rather than a hard error so adding
1494/// more workloads doesn't risk regressing the buffer path.
1495pub(crate) fn emit_fast_entry<'ctx>(
1496    ctx: &'ctx Context,
1497    module: &LlvmModule<'ctx>,
1498    func: &Func,
1499    profile: &FastPathProfile,
1500    helper_table: &HashMap<u32, FunctionValue<'ctx>>,
1501    closure_fn_table: &[FunctionValue<'ctx>],
1502) -> Result<FunctionValue<'ctx>, LlvmError> {
1503    if !is_buffer_protocol_signature(&func.params, func.ret) {
1504        return Err(LlvmError::UnsupportedSignature(
1505            "fast-path entry requires buffer-protocol IR".into(),
1506        ));
1507    }
1508    let arity = profile.arg_offsets.len();
1509    if arity > 8 {
1510        // Cap at 8 to keep the typed dispatch table in evaluator.rs
1511        // finite. Sources with arity > 8 stay on the buffer path —
1512        // their boundary cost is amortised across more work anyway.
1513        return Err(LlvmError::UnsupportedSignature(format!(
1514            "fast-path entry: arity {arity} exceeds cap of 8"
1515        )));
1516    }
1517
1518    let i64_t = ctx.i64_type();
1519    let param_types: Vec<BasicMetadataTypeEnum<'ctx>> = (0..arity).map(|_| i64_t.into()).collect();
1520    let fn_type = i64_t.fn_type(&param_types, false);
1521    let llvm_fn = module.add_function(ENTRY_SYMBOL_FAST, fn_type, None);
1522
1523    let entry_bb = ctx.append_basic_block(llvm_fn, "fast_entry");
1524    let builder = ctx.create_builder();
1525    builder.position_at_end(entry_bb);
1526
1527    // Reserve an alloca for the return value. The fast emitter
1528    // rewrites the trailing `StoreField` / `StoreFieldAtRecord` at
1529    // the return slot (which under buffer protocol writes the i64
1530    // result into the arena) to a store into this slot; the implicit
1531    // `Op::Return` at end-of-body loads from the slot and `ret`s it.
1532    // Placing the alloca in the entry block lets LLVM's mem2reg
1533    // promote it to SSA across the loop boundary.
1534    let ret_slot = builder
1535        .build_alloca(i64_t, "fast_ret_slot")
1536        .map_err(|e| LlvmError::Codegen(format!("fast ret_slot alloca: {e}")))?;
1537    // Initialise to 0 so any early `Op::Return` (no value path) still
1538    // produces a defined value — matches the buffer entry's
1539    // "ret root_size when no scalar stored" envelope.
1540    builder
1541        .build_store(ret_slot, i64_t.const_zero())
1542        .map_err(|e| LlvmError::Codegen(format!("fast ret_slot init: {e}")))?;
1543
1544    // The fast entry is a typed `(i64...) -> i64` shape derived from
1545    // the buffer-protocol IR after the dispatch-boundary rewrite. It
1546    // doesn't touch the const-data pool (the IR only contains scalar
1547    // arithmetic ops) so we hand it an empty pool to keep
1548    // `Emit::new` polymorphic.
1549    let empty_pool = ConstPool::default();
1550    let mut emit = Emit::new(
1551        ctx,
1552        &builder,
1553        module,
1554        llvm_fn,
1555        EntryShape::LegacyI64,
1556        /*arena_base_ptr=*/ None,
1557        /*state_ptr=*/ None,
1558        /*buffer_return_size=*/ 0,
1559        &empty_pool,
1560    );
1561    emit.fast_path = Some(FastEmit {
1562        profile: profile.clone(),
1563        ret_slot,
1564    });
1565    // LLVM param i corresponds to arg i — no implicit state slot for
1566    // the fast entry. `LocalGet` should never appear in the body
1567    // because the IR producer only emits LocalGet for the handshake
1568    // params (which the fast path doesn't pass).
1569    emit.param_base = 0;
1570    emit.llvm_trap_fn = Some(declare_llvm_trap(ctx, module));
1571    // Phase D.2: plumb the module-wide helper and closure tables so
1572    // an in-body `Op::Call` / `Op::MakeClosure` / `Op::CallClosure`
1573    // can resolve sibling functions. The fast emitter's per-op rewrites
1574    // (`MakeClosure` → virtualised closure, `CallClosure` → direct
1575    // call with null state/captures) consult these tables to pick the
1576    // matching `FunctionValue`.
1577    emit.helper_table = Some(helper_table.clone());
1578    emit.closure_fn_table = closure_fn_table.to_vec();
1579    emit.let_floor = relon_ir::ir::body_let_watermark(&func.body);
1580    emit.lower_body(&func.body)?;
1581
1582    // The buffer-protocol IR ends with `Op::Return` which the fast
1583    // emitter rewrote into a load+ret. If the body fell through
1584    // without an explicit Return (shouldn't happen for well-formed
1585    // `#main` IR, but be defensive), seal it with a load+ret.
1586    if let Some(cur) = builder.get_insert_block() {
1587        if cur.get_terminator().is_none() {
1588            let v = builder
1589                .build_load(i64_t, ret_slot, "fast_ret_load")
1590                .map_err(|e| LlvmError::Codegen(format!("fast trailing load: {e}")))?
1591                .into_int_value();
1592            builder
1593                .build_return(Some(&v))
1594                .map_err(|e| LlvmError::Codegen(format!("fast trailing ret: {e}")))?;
1595        }
1596    }
1597
1598    Ok(llvm_fn)
1599}
1600
1601// ---------------------------------------------------------------------------
1602// Legacy-i64 entry (Phase A bootstrap envelope, retained for tests)
1603// ---------------------------------------------------------------------------
1604
1605fn emit_legacy_entry_with_helpers<'ctx>(
1606    ctx: &'ctx Context,
1607    module: &LlvmModule<'ctx>,
1608    func: &Func,
1609    helper_table: &HashMap<u32, FunctionValue<'ctx>>,
1610    imports: &[relon_ir::ir::NativeImport],
1611    world_mode: WorldMode,
1612) -> Result<FunctionValue<'ctx>, LlvmError> {
1613    emit_legacy_entry_impl(ctx, module, func, Some(helper_table), imports, world_mode)
1614}
1615
1616/// Emit a Phase-A `(I64...) -> I64` function. Used by tests + the
1617/// Phase A bootstrap benchmarks that exercise the hand-built IR
1618/// fixtures directly (no buffer-protocol wrapping).
1619fn emit_legacy_entry_impl<'ctx>(
1620    ctx: &'ctx Context,
1621    module: &LlvmModule<'ctx>,
1622    func: &Func,
1623    helper_table: Option<&HashMap<u32, FunctionValue<'ctx>>>,
1624    imports: &[relon_ir::ir::NativeImport],
1625    world_mode: WorldMode,
1626) -> Result<FunctionValue<'ctx>, LlvmError> {
1627    for (i, p) in func.params.iter().enumerate() {
1628        if *p != IrType::I64 {
1629            return Err(LlvmError::UnsupportedSignature(format!(
1630                "llvm-aot: legacy-i64 envelope expects I64 param at #{i}, got {p:?}"
1631            )));
1632        }
1633    }
1634    if func.ret != IrType::I64 {
1635        return Err(LlvmError::UnsupportedSignature(format!(
1636            "llvm-aot: legacy-i64 envelope expects I64 return, got {:?}",
1637            func.ret
1638        )));
1639    }
1640
1641    let i64_t = ctx.i64_type();
1642    let param_types: Vec<BasicMetadataTypeEnum<'ctx>> =
1643        (0..func.params.len()).map(|_| i64_t.into()).collect();
1644    let fn_type = i64_t.fn_type(&param_types, false);
1645    let llvm_fn = module.add_function(ENTRY_SYMBOL, fn_type, None);
1646
1647    let entry_bb = ctx.append_basic_block(llvm_fn, "entry");
1648    let builder = ctx.create_builder();
1649    builder.position_at_end(entry_bb);
1650
1651    // Legacy-i64 entry shape only consumes the hand-built fixtures
1652    // (helloworld_arith) which never reference ConstString — an empty
1653    // pool is enough.
1654    let empty_pool = ConstPool::default();
1655    let mut emit = Emit::new(
1656        ctx,
1657        &builder,
1658        module,
1659        llvm_fn,
1660        EntryShape::LegacyI64,
1661        None,
1662        None,
1663        /*buffer_return_size=*/ 0,
1664        &empty_pool,
1665    );
1666    // Param order under the legacy envelope: every IR LocalGet(i)
1667    // maps to llvm_fn.param(i) — no implicit state slot.
1668    emit.param_base = 0;
1669    if let Some(table) = helper_table {
1670        emit.helper_table = Some(table.clone());
1671    }
1672    emit.llvm_trap_fn = Some(declare_llvm_trap(ctx, module));
1673    // Stage 1.B: closed-world legacy entry threads the `#native` import
1674    // table + pre-declares each host fn as an `extern` so `CallNative`
1675    // emits a direct `call @<host_symbol>` (no state pointer needed).
1676    // The open-world legacy path keeps `imports` empty (the legacy
1677    // fixtures never carry a `CallNative`).
1678    emit.imports = imports;
1679    emit.world_mode = world_mode;
1680    if matches!(world_mode, WorldMode::ClosedWorld) {
1681        for import in imports {
1682            declare_host_fn_direct(ctx, module, import);
1683        }
1684    }
1685    emit.let_floor = relon_ir::ir::body_let_watermark(&func.body);
1686    emit.lower_body(&func.body)?;
1687
1688    Ok(llvm_fn)
1689}
1690
1691// ---------------------------------------------------------------------------
1692// Buffer-protocol entry (Phase B production envelope)
1693// ---------------------------------------------------------------------------
1694
1695// Retained for symmetry with `emit_legacy_entry_with_helpers`; the
1696// Phase F.W7 emit path always routes through
1697// `emit_buffer_entry_with_helpers_and_closures` so a closure-free
1698// module still gets the new entry shape (with an empty closure
1699// table). Marked `#[allow(dead_code)]` to keep the symmetric pair
1700// visible without firing the unused-function lint.
1701#[allow(dead_code)]
1702fn emit_buffer_entry_with_helpers<'ctx>(
1703    ctx: &'ctx Context,
1704    module: &LlvmModule<'ctx>,
1705    func: &Func,
1706    buffer_return_size: u32,
1707    const_pool: &ConstPool,
1708    helper_table: &HashMap<u32, FunctionValue<'ctx>>,
1709) -> Result<FunctionValue<'ctx>, LlvmError> {
1710    emit_buffer_entry_impl(
1711        ctx,
1712        module,
1713        func,
1714        buffer_return_size,
1715        const_pool,
1716        Some(helper_table),
1717        &[],
1718        &[],
1719        WorldMode::OpenWorld,
1720        crate::CodegenTarget::Native,
1721        &[],
1722    )
1723}
1724
1725/// Phase F.W7 variant: same as [`emit_buffer_entry_with_helpers`] but
1726/// also threads the closure function-pointer table into the entry's
1727/// `Emit` so the body's `Op::MakeClosure` lowering can stamp the
1728/// matching `fn_table_idx` into the closure handle.
1729#[allow(clippy::too_many_arguments)]
1730fn emit_buffer_entry_with_helpers_and_closures<'ctx, 'cp>(
1731    ctx: &'ctx Context,
1732    module: &LlvmModule<'ctx>,
1733    func: &Func,
1734    buffer_return_size: u32,
1735    const_pool: &'cp ConstPool,
1736    helper_table: &HashMap<u32, FunctionValue<'ctx>>,
1737    closure_fn_table: &[FunctionValue<'ctx>],
1738    imports: &'cp [relon_ir::ir::NativeImport],
1739    world_mode: WorldMode,
1740    target: crate::CodegenTarget,
1741    effectful_imports: &'cp [bool],
1742) -> Result<FunctionValue<'ctx>, LlvmError> {
1743    emit_buffer_entry_impl(
1744        ctx,
1745        module,
1746        func,
1747        buffer_return_size,
1748        const_pool,
1749        Some(helper_table),
1750        closure_fn_table,
1751        imports,
1752        world_mode,
1753        target,
1754        effectful_imports,
1755    )
1756}
1757
1758/// Emit the buffer-protocol entry function. The cranelift backend's
1759/// equivalent lives in `relon-codegen-cranelift::codegen::mod.rs` —
1760/// signature mirrored here so a host that holds either evaluator
1761/// can dispatch through the same `(state, in_ptr, …)` argv shape.
1762#[allow(clippy::too_many_arguments)]
1763fn emit_buffer_entry_impl<'ctx, 'cp>(
1764    ctx: &'ctx Context,
1765    module: &LlvmModule<'ctx>,
1766    func: &Func,
1767    buffer_return_size: u32,
1768    const_pool: &'cp ConstPool,
1769    helper_table: Option<&HashMap<u32, FunctionValue<'ctx>>>,
1770    closure_fn_table: &[FunctionValue<'ctx>],
1771    imports: &'cp [relon_ir::ir::NativeImport],
1772    world_mode: WorldMode,
1773    target: crate::CodegenTarget,
1774    effectful_imports: &'cp [bool],
1775) -> Result<FunctionValue<'ctx>, LlvmError> {
1776    let i32_t = ctx.i32_type();
1777    let i64_t = ctx.i64_type();
1778    let ptr_t = ctx.ptr_type(AddressSpace::default());
1779
1780    // (*state, i32 in_ptr, i32 in_len, i32 out_ptr, i32 out_cap, i64 caps) -> i32
1781    let param_types: Vec<BasicMetadataTypeEnum<'ctx>> = vec![
1782        ptr_t.into(),
1783        i32_t.into(),
1784        i32_t.into(),
1785        i32_t.into(),
1786        i32_t.into(),
1787        i64_t.into(),
1788    ];
1789    let fn_type = i32_t.fn_type(&param_types, false);
1790    let llvm_fn = module.add_function(ENTRY_SYMBOL, fn_type, None);
1791
1792    let entry_bb = ctx.append_basic_block(llvm_fn, "entry");
1793    let builder = ctx.create_builder();
1794    builder.position_at_end(entry_bb);
1795
1796    // Resolve the per-call arena base once at function entry. The
1797    // LoadField / StoreField helpers consume this cached value so
1798    // the JIT doesn't reload `state->arena_base` on every access.
1799    let state_param = llvm_fn
1800        .get_nth_param(0)
1801        .ok_or_else(|| LlvmError::Codegen("buffer entry missing state param".into()))?
1802        .into_pointer_value();
1803
1804    // Pointer arithmetic on the state struct: GEP by ARENA_STATE_OFFSET_BASE
1805    // bytes through an i8 view, then load the `usize` arena base.
1806    // We use opaque pointers so the GEP element type only matters
1807    // for the offset calculation.
1808    let i8_t = ctx.i8_type();
1809    let arena_base_gep = unsafe {
1810        builder
1811            .build_in_bounds_gep(
1812                i8_t,
1813                state_param,
1814                &[i32_t.const_int(ARENA_STATE_OFFSET_BASE as u64, false)],
1815                "arena_base_gep",
1816            )
1817            .map_err(|e| LlvmError::Codegen(format!("arena_base GEP: {e}")))?
1818    };
1819    // `arena_base` is `usize`. On every supported host that's i64
1820    // (we only target x86_64 today; the inkwell feature set in the
1821    // Cargo.toml is `target-x86`). If we add a 32-bit host the
1822    // load type needs to follow `pointer_type` width — Phase B
1823    // assumes the workspace's only target is 64-bit.
1824    // TODO(P3-wasm32): use DataLayout pointer width instead of i64
1825    // for the arena-base word load + inttoptr below.
1826    let arena_base_load = builder
1827        .build_load(i64_t, arena_base_gep, "arena_base")
1828        .map_err(|e| LlvmError::Codegen(format!("arena_base load: {e}")))?;
1829    mark_invariant_load(ctx, arena_base_load);
1830    let arena_base_int = arena_base_load.into_int_value();
1831    let arena_base_ptr = builder
1832        .build_int_to_ptr(arena_base_int, ptr_t, "arena_base_ptr")
1833        .map_err(|e| LlvmError::Codegen(format!("arena_base inttoptr: {e}")))?;
1834
1835    // Phase E.1 prologue: init `state.tail_cursor = buffer_return_size`
1836    // so the first pointer-indirect StoreField lands past the fixed
1837    // area. Cheap (one store per call) — keeping it unconditional
1838    // avoids a body pre-scan. Bodies that never touch the tail
1839    // cursor pay the dead store; mem2reg / DSE eliminate it at -O3.
1840    let tail_init_gep = unsafe {
1841        builder
1842            .build_in_bounds_gep(
1843                i8_t,
1844                state_param,
1845                &[i32_t.const_int(u64::from(ARENA_STATE_OFFSET_TAIL_CURSOR), false)],
1846                "tail_cursor_init_gep",
1847            )
1848            .map_err(|e| LlvmError::Codegen(format!("tail_cursor init GEP: {e}")))?
1849    };
1850    let tail_init = i32_t.const_int(u64::from(buffer_return_size), false);
1851    builder
1852        .build_store(tail_init_gep, tail_init)
1853        .map_err(|e| LlvmError::Codegen(format!("tail_cursor init store: {e}")))?;
1854
1855    let mut emit = Emit::new(
1856        ctx,
1857        &builder,
1858        module,
1859        llvm_fn,
1860        EntryShape::Buffer,
1861        Some(arena_base_ptr),
1862        Some(state_param),
1863        buffer_return_size,
1864        const_pool,
1865    );
1866    // Buffer-protocol LocalGet(0..=3) reads the four i32 handshake
1867    // slots; LocalGet(4) reads the i64 `caps` slot. The state
1868    // pointer occupies slot 0 in the LLVM function — IR locals
1869    // start at +1 from there.
1870    emit.param_base = 1;
1871    if let Some(table) = helper_table {
1872        emit.helper_table = Some(table.clone());
1873    }
1874    emit.closure_fn_table = closure_fn_table.to_vec();
1875    emit.llvm_trap_fn = Some(declare_llvm_trap(ctx, module));
1876    // Phase 0b: thread the `#native` import table through so
1877    // `Op::CallNative` can validate the call shape.
1878    emit.imports = imports;
1879    emit.world_mode = world_mode;
1880    emit.target = target;
1881    emit.effectful_imports = effectful_imports;
1882    match world_mode {
1883        // Open-world (MCJIT / from_source): declare the dynamic-dispatch
1884        // helper so `Op::CallNative` emits a `call @relon_llvm_call_native`
1885        // that `add_global_mapping` later resolves to the host address.
1886        //
1887        // P3 §2.2: the wasm32 target has no MCJIT engine to patch the
1888        // helper symbol in — declaring it would leave an unresolvable
1889        // native import. The wasm path instead lowers each
1890        // `Op::CallNative` to a direct **wasm import** call
1891        // (`emit_call_native_wasi`), declaring the import lazily at the
1892        // call site, so we skip the helper declaration here.
1893        WorldMode::OpenWorld if matches!(target, crate::CodegenTarget::Wasm32) => {
1894            emit.call_native_fn = None;
1895        }
1896        WorldMode::OpenWorld => {
1897            emit.call_native_fn = Some(declare_call_native(ctx, module));
1898        }
1899        // Closed-world (Stage 1.B LTO co-compile): pre-declare every
1900        // host fn as an `extern` so `Op::CallNative` can emit a direct
1901        // `call @<host_symbol>`. The host bitcode is linked + inlined by
1902        // `crate::cocompile`. No `relon_llvm_call_native` helper exists
1903        // on this path.
1904        //
1905        // P3 §2.2 wasm closed-world: only pre-declare the **pure-compute**
1906        // host fns as direct externs (those get co-compiled + inlined).
1907        // An **effectful** host fn must NOT be inlined into the sandbox —
1908        // its `Op::CallNative` routes to `emit_call_native_wasi`, which
1909        // declares the `(import "env" …)` lazily. Pre-declaring it here as
1910        // a plain extern would still be link-resolved by the inlined-shim,
1911        // defeating the boundary, so we skip effectful imports.
1912        WorldMode::ClosedWorld => {
1913            emit.call_native_fn = None;
1914            for (idx, import) in imports.iter().enumerate() {
1915                let effectful = effectful_imports.get(idx).copied().unwrap_or(false);
1916                if !effectful {
1917                    declare_host_fn_direct(ctx, module, import);
1918                }
1919            }
1920        }
1921    }
1922    emit.let_floor = relon_ir::ir::body_let_watermark(&func.body);
1923    emit.emit_step_budget_check("entry")?;
1924    emit.lower_body(&func.body)?;
1925
1926    Ok(llvm_fn)
1927}
1928
1929// ---------------------------------------------------------------------------
1930// Per-function emitter state
1931// ---------------------------------------------------------------------------
1932
1933/// Per-function emitter state. Holds the inkwell builder borrow,
1934/// the LLVM function the emit targets, the IR's operand stack, and
1935/// the alloca slots backing `LetSet` / `LetGet`.
1936///
1937/// `param_base` accounts for the entry-shape's implicit param slot:
1938/// the buffer-protocol entry has the `*state` pointer at LLVM param
1939/// 0, so `LocalGet(0)` resolves to LLVM param 1. The legacy-i64
1940/// entry has no implicit slot, so `param_base = 0`.
1941pub(crate) struct Emit<'ctx, 'b, 'cp> {
1942    pub(crate) ctx: &'ctx Context,
1943    pub(crate) builder: &'b Builder<'ctx>,
1944    pub(crate) func: FunctionValue<'ctx>,
1945    /// Phase F.1: cached module reference so per-op lowering can
1946    /// declare extern symbols (the F.1 `str.contains` host shim) on
1947    /// demand without threading the module through every helper. The
1948    /// reference is borrowed for the emit pass only; `inkwell` keeps
1949    /// `Module` and `FunctionValue` lifetimes orthogonal so a borrow
1950    /// here doesn't conflict with the surrounding `add_function`
1951    /// calls in the entry/helper emit paths.
1952    pub(crate) module: &'b LlvmModule<'ctx>,
1953    pub(crate) shape: EntryShape,
1954    /// Cached `arena_base` pointer for the buffer-protocol entry.
1955    /// `None` for the legacy entry shape — `LoadField` / `StoreField`
1956    /// reject themselves before reaching for this value.
1957    pub(crate) arena_base_ptr: Option<PointerValue<'ctx>>,
1958    /// Cached state-pointer LLVM value (param 0 of the buffer entry).
1959    /// Phase E.1 uses it to load / store the per-call tail-cursor /
1960    /// scratch-cursor / scratch-base slots. `None` outside the
1961    /// buffer-protocol entry shape.
1962    pub(crate) state_ptr: Option<PointerValue<'ctx>>,
1963    /// Operand stack mirroring the IR's virtual stack. Every value
1964    /// in flight is an LLVM integer of the matching IR type. The
1965    /// pair tags the IR type so consumers can pick the right
1966    /// signed / unsigned predicate without re-deriving it.
1967    pub(crate) stack: Vec<TypedValue<'ctx>>,
1968    /// `LetSet { idx }` alloca slots, keyed by `(idx, ty)`. Each
1969    /// idx has at most one type at a time — the IR lowering pass
1970    /// guarantees no aliasing between idx's of different types.
1971    pub(crate) let_slots: std::collections::HashMap<u32, (PointerValue<'ctx>, IrType)>,
1972    /// Static let-index floor for stdlib inline-frame windows: the
1973    /// function body's [`relon_ir::ir::body_let_watermark`], i.e. one
1974    /// past the highest let index the body (recursively) touches.
1975    /// `emit_call_stdlib` places each inline window at
1976    /// `max(declared-slots max + 1, let_floor)` so callee lets never
1977    /// collide with caller lets that are first bound *after* the
1978    /// inlined call. While a frame is active the floor is raised past
1979    /// the callee body's own watermark (and restored on frame pop) so
1980    /// nested inlines stay collision-free too.
1981    pub(crate) let_floor: u32,
1982    /// LLVM param offset corresponding to `LocalGet(0)`. See
1983    /// [`Self::lookup_param`] — `param_base + idx` is the LLVM
1984    /// param index.
1985    pub(crate) param_base: u32,
1986    /// Label stack carrying the (entry_bb, exit_bb, kind) of every
1987    /// nested [`Op::Block`] / [`Op::Loop`]. `Br { label_depth }`
1988    /// indexes from the back (depth 0 = innermost). `Block`s exit
1989    /// to their tail; `Loop`s exit to their head.
1990    pub(crate) label_stack: Vec<LabelFrame<'ctx>>,
1991    /// Monotonic counter to mint unique LLVM basic block / value
1992    /// names so the dumped IR is human-readable.
1993    pub(crate) name_seq: u32,
1994    /// Phase B: hard-coded `return_root_size` returned from a
1995    /// buffer-protocol `Op::Return`. The IR producer leaves no
1996    /// value on the operand stack for `Return` under buffer
1997    /// protocol — the trampoline reads back `bytes_written` to
1998    /// decode the output record. We hard-code this to the schema's
1999    /// `return_layout.root_size`, passed in at emit time.
2000    pub(crate) buffer_return_size: u32,
2001    /// Phase D.1: set when emitting the fast-path entry. The
2002    /// `Op::LoadField` / `Op::StoreField` / `Op::Return` lowering
2003    /// branches consult this to rewrite the buffer-protocol IR
2004    /// against the typed `(i64...) -> i64` LLVM signature.
2005    pub(crate) fast_path: Option<FastEmit<'ctx>>,
2006    /// Phase E.2 multi-function lookup: when populated, `Op::Call`
2007    /// with `fn_index >= stdlib_function_count()` resolves to the
2008    /// matching sibling `FunctionValue` and emits a direct LLVM
2009    /// `call`. The map is keyed by IR-side `funcs` index (i.e.
2010    /// `fn_index - stdlib_count`). Empty for hand-built fixtures that
2011    /// never reference user-defined functions.
2012    pub(crate) helper_table: Option<HashMap<u32, FunctionValue<'ctx>>>,
2013    /// Phase E.2: when emitting a helper body (not the entry), this
2014    /// carries the IR-declared return type so `Op::Return` can pick
2015    /// the right LLVM `ret` shape. `None` while lowering the entry
2016    /// body — the entry's return shape is dictated by `EntryShape`.
2017    pub(crate) helper_ret_ty: Option<IrType>,
2018    /// Phase E.2: cached `llvm.trap` intrinsic `FunctionValue`. The
2019    /// intrinsic is declared once per module (in
2020    /// [`emit_module_funcs`]); each `Emit` snapshots the pointer so
2021    /// per-op `Div(I64)` / `Mod(I64)` guards can call it without
2022    /// re-querying the module.
2023    pub(crate) llvm_trap_fn: Option<FunctionValue<'ctx>>,
2024    /// Phase E.1: per-module const-data lookup. `Op::ConstString { idx }`
2025    /// reads the matching offset and pushes `iconst(I32, off)`.
2026    pub(crate) const_pool: &'cp ConstPool,
2027    /// Phase E.1: stack of inline call frames. `Op::Call` pushes one
2028    /// before lowering the callee body; `Op::Return` inside the
2029    /// callee body pops the typed value into the topmost frame's
2030    /// result alloca and jumps to its exit block. The callee's
2031    /// `LocalGet(idx)` resolves to `params[idx]` rather than the
2032    /// entry's LLVM params; `LetGet/LetSet` indices are remapped
2033    /// against `let_offset` so concurrent inline frames don't clash.
2034    pub(crate) inline_frames: Vec<InlineFrame<'ctx>>,
2035    /// Phase E.1: did the body emit a pointer-indirect StoreField?
2036    /// When set, the buffer-protocol epilogue returns the post-bump
2037    /// tail cursor (in bytes past `out_ptr`) rather than the
2038    /// statically-known `buffer_return_size`. Mirrors cranelift's
2039    /// `needs_tail_cursor` flag.
2040    pub(crate) needs_tail_cursor: bool,
2041    /// In-place region-walk return ABI (S2): set by `emit_store_field`
2042    /// when the entry returns a `List<List<scalar>>` sourced directly
2043    /// from a `#main` parameter. Holds the **arena-relative** i32 offset
2044    /// of the root list header (the value `Op::LoadListListPtr` pushed,
2045    /// already rebased by `in_ptr`). No bytes are copied into `out_buf`;
2046    /// instead the buffer epilogue (`emit_return`) encodes this offset as
2047    /// the negative in-place sentinel `-(root_abs + 1)` and returns it,
2048    /// telling the host to verify + decode the value in place at its
2049    /// source region rather than at `out_ptr`. `None` for every other
2050    /// return shape, which keeps the existing `buffer_return_size` /
2051    /// tail-cursor epilogue. Mirrors cranelift's `inplace_return_root`.
2052    pub(crate) inplace_return_root: Option<IntValue<'ctx>>,
2053    /// Phase F.W7: ordered list of lambda `FunctionValue`s, indexed by
2054    /// `fn_table_idx`. `Op::MakeClosure { fn_table_idx }` stamps the
2055    /// matching index into the closure handle's `fn_table_idx` slot
2056    /// and uses the same lookup to resolve the function pointer to
2057    /// stash. `Op::CallClosure` reads the handle's `fn_table_idx`
2058    /// slot and dispatches indirectly through a private global table
2059    /// of function pointers seeded from this list. Empty when the
2060    /// module contains no lambdas.
2061    pub(crate) closure_fn_table: Vec<FunctionValue<'ctx>>,
2062    /// Phase F.W7: per-IR-`record_local_idx` allocas backing
2063    /// `Op::AllocRootRecord` / `Op::StoreFieldAtRecord`. The slot
2064    /// holds an i32 out_ptr-relative offset; `AllocRootRecord` writes
2065    /// `0` there (root sits at `out_ptr + 0`), `StoreFieldAtRecord`
2066    /// reads it back to compute the destination address. Mirrors
2067    /// cranelift's `record_locals` map.
2068    pub(crate) record_locals: std::collections::HashMap<u32, PointerValue<'ctx>>,
2069    /// Phase H: bytes literal pushed by the *immediately preceding*
2070    /// `Op::ConstString` op (i.e. still the top-of-stack at the start
2071    /// of the next `lower_op` call). Cleared at the start of every
2072    /// `lower_op` and re-populated by the `Op::ConstString` arm at
2073    /// its tail. The `Op::Call` arm reads this when `fn_index ==
2074    /// STDLIB_IDX_CONTAINS` to detect the const-needle case and
2075    /// inline a tight byte-scan loop, skipping the
2076    /// `relon_llvm_str_contains_arena` extern shim's FFI boundary
2077    /// (~10-15 cycles of prologue/epilogue per call on x86_64). On
2078    /// the W4 / W4_long hot loops the needle is always a
2079    /// compile-time const (`"x"`), so the const-needle fast path
2080    /// fires 100% of iters. Stays `None` when the needle came in via
2081    /// `LocalGet` / `LetGet` / any non-`ConstString` producer — those
2082    /// fall through to the existing extern path.
2083    pub(crate) last_const_string: Option<Vec<u8>>,
2084    /// Phase F.W7 self-recursion fast path: per-lambda map of captures
2085    /// struct offsets that hold a self-recursive closure handle, keyed
2086    /// by the `fn_table_idx` of the enclosing lambda. Populated only
2087    /// for lambda bodies (the entry / helpers leave it empty); the
2088    /// scanner in `build_self_capture_table` correlates each
2089    /// `Op::MakeClosure` in the entry with the immediately following
2090    /// `LetSet { idx, ty: Closure }` to identify captures whose
2091    /// `cap.let_idx == idx` (i.e. the binding being assigned right
2092    /// after MakeClosure — the canonical IR shape for a self-recursive
2093    /// closure-as-value let). The value `Vec<(offset,
2094    /// self_fn_table_idx)>` lets the lambda-prologue `Op::LocalGet(0);
2095    /// Op::LoadI32AtAbsolute { offset }` chain stamp the matching
2096    /// [`Provenance::OwnCaptureHandle`] on the produced handle so the
2097    /// downstream `Op::CallClosure` can pick the direct-call fast path
2098    /// (skip handle deref, skip switch, reuse the lambda's own
2099    /// captures_ptr LLVM param 1). Empty when the lambda has no
2100    /// self-recursive captures or when self-recursion detection is
2101    /// unavailable (legacy / fixture entries that bypass the
2102    /// MakeClosure → LetSet pattern).
2103    pub(crate) self_capture_offsets: Vec<(u32, u32)>,
2104    /// Phase F.W7 self-recursion fast path: let-slot indices that hold
2105    /// a self-recursive closure handle along with the enclosing
2106    /// lambda's `fn_table_idx`. Populated by `Op::LetSet` when the
2107    /// stored value carries [`Provenance::OwnCaptureHandle`] so the
2108    /// matching `Op::LetGet` can re-emit the provenance — this is what
2109    /// lets the recursive `fib(k - 1)` call site (which always goes
2110    /// through `LetGet`) keep the self-recursion fast path intact.
2111    pub(crate) self_capture_let_slots: std::collections::HashMap<u32, (u32, u32)>,
2112    /// Phase F.W7 self-recursion fast path: captures_ptr LLVM param
2113    /// (param 1) of the enclosing lambda. Cached so the closure-call
2114    /// emitter can pass it straight into the recursive call without
2115    /// re-loading from the closure handle. `None` when emitting the
2116    /// entry / a helper (not a lambda body) — the self-recursion fast
2117    /// path is gated on this being `Some`.
2118    pub(crate) captures_ptr_param: Option<IntValue<'ctx>>,
2119    /// Phase D.2 fast-path entry: let-slot indices holding a
2120    /// virtualised closure stamped by an in-body `Op::MakeClosure`
2121    /// (carries `Provenance::FastPathClosure`). The `LetSet` that
2122    /// catches such a value stashes the `fn_table_idx` here so the
2123    /// matching `LetGet` can re-emit the provenance, keeping the
2124    /// `CallClosure` direct-call rewrite alive across the let chain.
2125    /// Empty when not emitting the fast-path entry.
2126    pub(crate) fast_path_closure_let_slots: std::collections::HashMap<u32, u32>,
2127    /// Phase L W3: let-slot indices holding a `Provenance::ConstString`
2128    /// value (i.e. the let was set from a value sourced — directly or
2129    /// via prior `LetGet` chains — from an `Op::ConstString`). The
2130    /// matching `LetGet` re-stamps the provenance so the downstream
2131    /// `Op::Add(String)` lowering can switch to the const-len /
2132    /// single-byte-store fast path. Each entry records (len, optional
2133    /// first_byte). Empty by default; entries survive only across
2134    /// inner-loop iterations because the W3 reduce shape's `s` let is
2135    /// re-set every iteration from the same const literal.
2136    pub(crate) const_string_let_slots: std::collections::HashMap<u32, (u32, Option<u8>)>,
2137    /// Devirtualisation (W18): let-slot indices holding a real
2138    /// arena-resident closure handle whose `fn_table_idx` is a
2139    /// compile-time constant (`Provenance::KnownClosure`). The `LetSet`
2140    /// that catches such a value stashes the `fn_table_idx` here so the
2141    /// matching `LetGet` re-stamps the provenance, letting the downstream
2142    /// `CallClosure` emit a direct call (LLVM inlines it) instead of the
2143    /// runtime `switch i32 %cc_fn_idx`. A non-known-closure `LetSet`
2144    /// against the same slot wipes the entry so a later `LetGet` cannot
2145    /// fraudulently claim a static target. Empty by default.
2146    pub(crate) known_closure_let_slots: std::collections::HashMap<u32, u32>,
2147    /// Devirtualisation (W18): `(capture_offset, captured_fn_table_idx)`
2148    /// pairs for the lambda body currently being emitted, identifying
2149    /// captures-struct offsets that hold a handle produced by a literal
2150    /// `MakeClosure` with a compile-time-constant `fn_table_idx` (a
2151    /// *known* closure that is NOT a self-capture). The capture-load
2152    /// prologue (`LocalGet(0); LoadI32AtAbsolute { offset }`) stamps
2153    /// [`Provenance::KnownClosure`] on the matching load so a body
2154    /// `CallClosure` against the capture emits a direct call. Seeded by
2155    /// [`build_known_capture_table`]; empty when emitting the entry /
2156    /// helpers or a lambda with no such captures.
2157    pub(crate) known_capture_offsets: Vec<(u32, u32)>,
2158    /// Phase 0b native dispatch: the module's `#native` imports, in
2159    /// `import_idx` order. `Op::CallNative` validates the call's
2160    /// `import_idx` / param-shape / ret-ty against this table before
2161    /// emitting the dispatch (mirrors cranelift's `self.ir.imports`
2162    /// check). Empty for hand-built fixtures / fast / helper / lambda
2163    /// emits — those never carry a `CallNative`, so the validation arm
2164    /// surfaces a precise `Codegen` error if one slips through.
2165    pub(crate) imports: &'cp [relon_ir::ir::NativeImport],
2166    /// Phase 0b native dispatch: the declared `relon_llvm_call_native`
2167    /// helper `FunctionValue`. `Op::CallNative` emits a `call` against
2168    /// it. `None` outside the buffer-protocol entry (the only shape
2169    /// that carries a `*state` pointer to thread through).
2170    pub(crate) call_native_fn: Option<FunctionValue<'ctx>>,
2171    /// Stage 1.B: open-world (dynamic helper) vs closed-world (static
2172    /// direct `call @<host_symbol>`) native dispatch. Defaults to
2173    /// [`WorldMode::OpenWorld`] so MCJIT / `from_source` are untouched;
2174    /// only `crate::cocompile` flips it to `ClosedWorld`.
2175    pub(crate) world_mode: WorldMode,
2176    /// P3 §2.2: the codegen target. Defaults to
2177    /// [`CodegenTarget::Native`]; only the wasm32 object-emit path flips
2178    /// it to [`CodegenTarget::Wasm32`]. On wasm32 an open-world
2179    /// `Op::CallNative` lowers to a **wasm import** call (see
2180    /// [`crate::wasi_host`]) instead of the native MCJIT
2181    /// `relon_llvm_call_native` helper, which the sandbox cannot reach.
2182    pub(crate) target: crate::CodegenTarget,
2183    /// P3 §2.2 wasm closed-world routing: per-`import_idx` effectful flag.
2184    /// `effectful_imports[i] == true` means the host fn at import index
2185    /// `i` is capability-gated (a preceding `Op::CheckCap` guards its
2186    /// call) — an *effectful* fn that must cross the sandbox boundary as a
2187    /// **WASI import**, not be inlined into the wasm unit. `false` (or an
2188    /// out-of-range index on the legacy / native paths) means pure-compute:
2189    /// co-compile + inline. Empty slice on every path except wasm32
2190    /// closed-world; the wasm closed-world emit
2191    /// (`emit_module_funcs_closed_world_wasm`) populates it from the IR's
2192    /// CheckCap → CallNative shape.
2193    pub(crate) effectful_imports: &'cp [bool],
2194}
2195
2196/// Phase E.1: per-call inline-frame state. One entry per active
2197/// stdlib `Op::Call`; the callee body lowers against the topmost
2198/// frame.
2199pub(crate) struct InlineFrame<'ctx> {
2200    /// LLVM values bound to the callee's `LocalGet(0..arity)` reads.
2201    /// Order matches the IR's declared parameter order — the
2202    /// `Op::Call` site popped them from the caller's operand stack
2203    /// (top-of-stack = last param) and reversed.
2204    pub(crate) params: Vec<TypedValue<'ctx>>,
2205    /// Offset added to the callee's `LetGet/LetSet` indices so its
2206    /// let-bindings don't alias the caller's slots. Mirrors the
2207    /// cranelift backend's `let_offset`.
2208    pub(crate) let_offset: u32,
2209    /// Result alloca + exit basic block. The callee's `Op::Return`
2210    /// stores the popped value into the alloca and unconditionally
2211    /// branches to `exit_bb`; the caller continues from there with a
2212    /// matching load.
2213    pub(crate) ret_slot: PointerValue<'ctx>,
2214    /// LLVM type stored at [`Self::ret_slot`]. Pre-computed from the
2215    /// IR-declared `ret_ty` of the stdlib call so the caller-side
2216    /// load knows what width to read.
2217    pub(crate) ret_ty: IrType,
2218    /// Branch target for `Op::Return` inside the callee body. The
2219    /// caller positions the builder here after the inline finishes
2220    /// and pushes the loaded return value back onto the operand
2221    /// stack.
2222    pub(crate) exit_bb: inkwell::basic_block::BasicBlock<'ctx>,
2223}
2224
2225/// Phase D.1 fast-path emission state. Carried inside [`Emit`] when
2226/// lowering the typed fast entry.
2227#[derive(Clone)]
2228pub(crate) struct FastEmit<'ctx> {
2229    pub(crate) profile: FastPathProfile,
2230    /// Alloca holding the i64 return value. Trailing `StoreField`
2231    /// at `profile.ret_offset` writes into this slot; `Op::Return`
2232    /// loads from it.
2233    pub(crate) ret_slot: PointerValue<'ctx>,
2234}
2235
2236#[derive(Clone, Copy)]
2237pub(crate) struct TypedValue<'ctx> {
2238    pub(crate) val: IntValue<'ctx>,
2239    /// IR-level tag of `val`. Recorded so Phase C predicates that
2240    /// inspect operand types (signed-vs-unsigned cmp, F64 routing)
2241    /// have it on hand without re-deriving from LLVM bit width.
2242    /// Phase B never consumes this field; `#[allow(dead_code)]`
2243    /// keeps the lint clean while we're still wiring future Op
2244    /// support.
2245    #[allow(dead_code)]
2246    pub(crate) ty: IrType,
2247    /// Provenance hint used by [`Emit::emit_call_closure`] to detect
2248    /// self-recursive closure calls. Defaults to [`Provenance::None`]
2249    /// for every push that doesn't go through the lambda-prologue
2250    /// capture path; the closure-self-call fast path only fires when
2251    /// the consumed handle's provenance points at one of the lambda's
2252    /// own self-capture offsets.
2253    pub(crate) prov: Provenance,
2254}
2255
/// Records the origin of an [`IntValue`] on the operand stack so the
/// closure-call emitter can recognise self-recursion without
/// re-loading the handle's captures pointer through arena
/// indirection.
///
/// Motivation: in the W7 production source the `fib` closure captures
/// itself, so each recursive `fib(k - 1)` call site walks
/// `captures_ptr -> self_handle -> captures_ptr_field -> direct call`.
/// LLVM cannot fold the `captures_ptr_field` load back to the incoming
/// `captures_ptr`, because the chain crosses a `MakeClosure` in
/// another function (no IPA reach). Pure post-O3 IR therefore keeps
/// three arena loads per recursion (`~10 ns/call ≈ +170 µs` over
/// `fib(22)`).
///
/// These two tags carry enough information to short-circuit that:
///
/// * `OwnCapturesPtr` — the value is the lambda's own captures_ptr arg
///   (LLVM param 1), as produced by `Op::LocalGet(0)` inside a lambda.
/// * `OwnCaptureHandle { offset, self_fn_table_idx }` — the value is a
///   closure handle loaded from `captures_ptr + offset`, and the
///   matching `MakeClosure` capture is self-recursive (the handle
///   points back at the enclosing lambda, whose `fn_table_idx ==
///   self_fn_table_idx`). `Op::CallClosure` can then emit a direct
///   call to `closure_fn_table[self_fn_table_idx]` with the current
///   `captures_ptr` arg — no handle deref, no switch, no trap branch.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum Provenance {
    /// No origin information is attached to the value.
    None,
    /// The captures_ptr arg of the enclosing lambda (its LLVM param 1).
    OwnCapturesPtr,
    /// A closure handle loaded from `captures_ptr + offset` whose
    /// matching MakeClosure capture is self-recursive: the handle's
    /// `captures_ptr` field equals `OwnCapturesPtr` and its
    /// `fn_table_idx` equals `self_fn_table_idx`.
    OwnCaptureHandle {
        #[allow(dead_code)]
        offset: u32,
        self_fn_table_idx: u32,
    },
    /// Phase D.2: a closure handle materialised by a `MakeClosure` op
    /// inside the fast-path entry. That entry has no arena/state, so
    /// `MakeClosure` cannot bump-allocate the 8-byte handle record.
    /// The value is virtualised instead: the `fn_table_idx` is
    /// remembered and the matching `CallClosure` is rewritten into a
    /// direct call against the lambda function. The lambda's
    /// `(state, captures_ptr, args...)` signature is satisfied by
    /// passing null / zero for state / captures, which is sound for
    /// W7-style self-recursive closures whose post-O3 body drops both
    /// args.
    FastPathClosure {
        fn_table_idx: u32,
    },
    /// Devirtualisation (W18, 2026-05-30): the IntValue is a *real*
    /// arena-resident closure handle (`[fn_table_idx][captures_ptr]`)
    /// created by a literal [`Op::MakeClosure`] whose `fn_table_idx`
    /// is a compile-time constant. In contrast to
    /// [`Self::FastPathClosure`], the handle is fully materialised in
    /// the arena (the buffer-protocol entry has state + arena), so the
    /// matching `CallClosure` still loads the real `captures_ptr` from
    /// `handle + 4`. The only thing it skips is the runtime
    /// `switch i32 %cc_fn_idx` over `handle + 0`, since the handle's
    /// `fn_table_idx` word is *provably* this constant.
    ///
    /// Soundness: the value reaches the `CallClosure` unmodified from
    /// the `MakeClosure` (possibly via a `LetSet`/`LetGet` round-trip
    /// or an inline-frame argument bind), so exactly one callee is
    /// possible and the switch's runtime selection is statically
    /// decided. Handles that did *not* arrive with this provenance (a
    /// genuinely-dynamic dispatch) keep the slow-path `build_switch`.
    /// When the W18 `_list_filter` predicate (a literal
    /// `(k) => is_prime(k, 2)` MakeClosure) is inlined into the
    /// bundled `list_int_filter` body, the per-element predicate
    /// dispatch becomes a direct call that LLVM then inlines, removing
    /// the hot-loop switch.
    KnownClosure {
        fn_table_idx: u32,
    },
    /// Phase L W3 (2026-05-28): the IntValue is an i32 arena offset to
    /// a `[len:u32 LE][payload]` String record whose payload was put
    /// into the const-pool prefix at module build time, which makes
    /// its length a compile-time constant. `Op::ConstString` attaches
    /// the tag and it is propagated through
    /// `Op::LetSet { ty: String }` → `Op::LetGet { ty: String }`, so
    /// `Op::Add(String)` can hand the const length to LLVM (a memcpy
    /// intrinsic with const size lowers to inline stores) and skip the
    /// per-iter `[len]` header reload.
    ///
    /// Single-byte payloads (the `"a"` of the W3 reduce hot loop)
    /// additionally expose `first_byte`, letting the in-place fast
    /// path emit one `i8 store` in place of `memcpy` — this bypasses
    /// the LLVM lowering pass altogether for the dominant reduce
    /// shape.
    ConstString {
        len: u32,
        /// `Some(byte)` when `len == 1`, so the lowering can emit an
        /// inline `store i8 byte, dst` rather than a memcpy intrinsic.
        /// `None` for longer payloads (LLVM's memcpy intrinsic
        /// lowering still copes well with those once the size is
        /// const).
        first_byte: Option<u8>,
    },
}
2351
/// Flavour of a structured-control-flow label; it decides where a
/// `Br` that targets the label lands.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum LabelKind {
    /// Forward exit: `Br` jumps **past** the block.
    Block,
    /// Continue: `Br` jumps **back** to the loop header.
    Loop,
}
2359
2360#[derive(Clone, Copy)]
2361pub(crate) struct LabelFrame<'ctx> {
2362    /// Header basic block. For Block this is unused for branching
2363    /// (we never branch backward to the start of a block); for Loop
2364    /// it's the target of a `Br` (continue).
2365    pub(crate) header_bb: inkwell::basic_block::BasicBlock<'ctx>,
2366    /// Tail basic block — what code after the block / after the
2367    /// loop falls through to. For Block this is the `Br` target;
2368    /// for Loop the surrounding code lives here.
2369    pub(crate) tail_bb: inkwell::basic_block::BasicBlock<'ctx>,
2370    pub(crate) kind: LabelKind,
2371}
2372
2373impl<'ctx, 'b, 'cp> Emit<'ctx, 'b, 'cp> {
2374    #[allow(clippy::too_many_arguments)]
2375    pub(crate) fn new(
2376        ctx: &'ctx Context,
2377        builder: &'b Builder<'ctx>,
2378        module: &'b LlvmModule<'ctx>,
2379        func: FunctionValue<'ctx>,
2380        shape: EntryShape,
2381        arena_base_ptr: Option<PointerValue<'ctx>>,
2382        state_ptr: Option<PointerValue<'ctx>>,
2383        buffer_return_size: u32,
2384        const_pool: &'cp ConstPool,
2385    ) -> Self {
2386        Self {
2387            ctx,
2388            builder,
2389            func,
2390            module,
2391            shape,
2392            arena_base_ptr,
2393            state_ptr,
2394            stack: Vec::with_capacity(8),
2395            let_slots: std::collections::HashMap::new(),
2396            let_floor: 0,
2397            param_base: 0,
2398            label_stack: Vec::new(),
2399            name_seq: 0,
2400            buffer_return_size,
2401            fast_path: None,
2402            helper_table: None,
2403            helper_ret_ty: None,
2404            llvm_trap_fn: None,
2405            const_pool,
2406            inline_frames: Vec::new(),
2407            needs_tail_cursor: false,
2408            inplace_return_root: None,
2409            last_const_string: None,
2410            closure_fn_table: Vec::new(),
2411            record_locals: std::collections::HashMap::new(),
2412            self_capture_offsets: Vec::new(),
2413            self_capture_let_slots: std::collections::HashMap::new(),
2414            captures_ptr_param: None,
2415            fast_path_closure_let_slots: std::collections::HashMap::new(),
2416            const_string_let_slots: std::collections::HashMap::new(),
2417            known_closure_let_slots: std::collections::HashMap::new(),
2418            known_capture_offsets: Vec::new(),
2419            imports: &[],
2420            call_native_fn: None,
2421            world_mode: WorldMode::OpenWorld,
2422            target: crate::CodegenTarget::Native,
2423            effectful_imports: &[],
2424        }
2425    }
2426
2427    pub(crate) fn next_name(&mut self, hint: &str) -> String {
2428        self.name_seq += 1;
2429        format!("{hint}_{}", self.name_seq)
2430    }
2431
2432    // -- stack helpers --------------------------------------------------
2433
2434    pub(crate) fn push(&mut self, v: IntValue<'ctx>, ty: IrType) {
2435        self.stack.push(TypedValue {
2436            val: v,
2437            ty,
2438            prov: Provenance::None,
2439        });
2440    }
2441
2442    /// Push a value while attaching a [`Provenance`] tag. Currently
2443    /// only emitted by the lambda-prologue capture path
2444    /// (`LocalGet(0)` → `LoadI32AtAbsolute` → `LetSet/LetGet`) so
2445    /// `emit_call_closure` can short-circuit self-recursive calls.
2446    pub(crate) fn push_with_prov(&mut self, v: IntValue<'ctx>, ty: IrType, prov: Provenance) {
2447        self.stack.push(TypedValue { val: v, ty, prov });
2448    }
2449
2450    /// Phase F.W7 self-recursion fast path: peek the operand stack's
2451    /// top-of-stack provenance without consuming it and return the
2452    /// matching [`Provenance::OwnCaptureHandle`] when the top is the
2453    /// lambda's captures_ptr and `offset` matches a recorded self-
2454    /// recursive capture offset. Returns `None` otherwise — the
2455    /// caller then leaves the produced value's provenance at
2456    /// [`Provenance::None`] and the closure-call emitter falls back
2457    /// to the slow-path switch dispatch.
2458    ///
2459    /// Caller uses this **after** `emit_load_at_absolute` pops the
2460    /// base; we read the stack top here before that pop runs, so
2461    /// the lookup remains correct (the base is still on top when
2462    /// the dispatcher arm fires).
2463    pub(crate) fn peek_self_capture_provenance(&self, offset: u32) -> Option<Provenance> {
2464        let top = self.stack.last()?;
2465        if !matches!(top.prov, Provenance::OwnCapturesPtr) {
2466            return None;
2467        }
2468        // Self-recursive capture wins (its `captures_ptr`-reuse direct
2469        // path is strictly cheaper than re-loading the handle's
2470        // captures_ptr field).
2471        for (cap_offset, self_fn_table_idx) in &self.self_capture_offsets {
2472            if *cap_offset == offset {
2473                return Some(Provenance::OwnCaptureHandle {
2474                    offset,
2475                    self_fn_table_idx: *self_fn_table_idx,
2476                });
2477            }
2478        }
2479        // Devirtualisation (W18): a capture of a known (non-self)
2480        // closure. Stamp `KnownClosure` so the body's `CallClosure`
2481        // against the capture emits a direct call (still loading the
2482        // capture's own captures_ptr) instead of the runtime switch.
2483        for (cap_offset, captured_fn_table_idx) in &self.known_capture_offsets {
2484            if *cap_offset == offset {
2485                return Some(Provenance::KnownClosure {
2486                    fn_table_idx: *captured_fn_table_idx,
2487                });
2488            }
2489        }
2490        None
2491    }
2492
2493    pub(crate) fn pop(&mut self, ip_hint: &str) -> Result<TypedValue<'ctx>, LlvmError> {
2494        self.stack.pop().ok_or_else(|| {
2495            LlvmError::Codegen(format!(
2496                "operand stack underflow at {ip_hint}: producer emitted an Op with no matching push"
2497            ))
2498        })
2499    }
2500
2501    pub(crate) fn pop_int(&mut self, ip_hint: &str) -> Result<IntValue<'ctx>, LlvmError> {
2502        self.pop(ip_hint).map(|tv| tv.val)
2503    }
2504
2505    // -- locals / lets --------------------------------------------------
2506
2507    pub(crate) fn lookup_param(&self, idx: u32) -> Result<IntValue<'ctx>, LlvmError> {
2508        let llvm_idx = self
2509            .param_base
2510            .checked_add(idx)
2511            .ok_or_else(|| LlvmError::Codegen(format!("LocalGet({idx}): param idx overflow")))?;
2512        let p = self.func.get_nth_param(llvm_idx).ok_or_else(|| {
2513            LlvmError::Codegen(format!(
2514                "LocalGet({idx}) -> llvm param #{llvm_idx} out of range; function has {} param(s)",
2515                self.func.count_params()
2516            ))
2517        })?;
2518        match p {
2519            BasicValueEnum::IntValue(v) => Ok(v),
2520            other => Err(LlvmError::Codegen(format!(
2521                "LocalGet({idx}) llvm param #{llvm_idx} is {other:?}, expected IntValue"
2522            ))),
2523        }
2524    }
2525
2526    pub(crate) fn ensure_let_slot(
2527        &mut self,
2528        idx: u32,
2529        ty: IrType,
2530    ) -> Result<PointerValue<'ctx>, LlvmError> {
2531        if let Some((ptr, existing_ty)) = self.let_slots.get(&idx) {
2532            if *existing_ty != ty {
2533                return Err(LlvmError::Codegen(format!(
2534                    "let-slot {idx} aliased: previous type {existing_ty:?}, new type {ty:?}"
2535                )));
2536            }
2537            return Ok(*ptr);
2538        }
2539        // Allocate in the function's entry block so the alloca is
2540        // hoisted out of any loop body. inkwell's `build_alloca`
2541        // emits at the current position, so we temporarily reposition.
2542        let entry_bb = self.func.get_first_basic_block().ok_or_else(|| {
2543            LlvmError::Codegen("ensure_let_slot: function has no entry block".into())
2544        })?;
2545        let cur = self.builder.get_insert_block();
2546        // Position at the start of the entry block so allocas group
2547        // at the top — LLVM mem2reg requires this canonical layout
2548        // to promote slots into SSA.
2549        if let Some(first_instr) = entry_bb.get_first_instruction() {
2550            self.builder.position_before(&first_instr);
2551        } else {
2552            self.builder.position_at_end(entry_bb);
2553        }
2554        let llvm_ty: inkwell::types::BasicTypeEnum<'ctx> = match ty {
2555            // AOT-1: F64 rides as i64 bits on the virtual stack, so its
2556            // let-slot is the same 64-bit-wide integer alloca as I64.
2557            // The `(idx, ty)` aliasing key keeps an I64 and an F64 slot
2558            // for the same index distinct, so the bit pattern never gets
2559            // reinterpreted across types.
2560            IrType::I64 | IrType::F64 => self.ctx.i64_type().into(),
2561            // Phase E.1: String / List* arena offsets ride on an i32
2562            // slot — matches the cranelift backend's pointer-as-i32
2563            // wire representation.
2564            //
2565            // Phase F.W7: `Closure` joins the i32-wide variants
2566            // (closure handle is an arena-relative i32 pointer at
2567            // the IR / cranelift / LLVM boundary alike).
2568            IrType::I32
2569            | IrType::Bool
2570            | IrType::Unit
2571            | IrType::String
2572            | IrType::ListInt
2573            | IrType::ListFloat
2574            | IrType::ListBool
2575            | IrType::ListString
2576            | IrType::ListSchema
2577            | IrType::ListList
2578            | IrType::Closure
2579            | IrType::Dict => self.ctx.i32_type().into(),
2580        };
2581        let name = format!("let_{idx}");
2582        let ptr = self
2583            .builder
2584            .build_alloca(llvm_ty, &name)
2585            .map_err(|e| LlvmError::Codegen(format!("let-slot {idx} alloca: {e}")))?;
2586        if let Some(bb) = cur {
2587            self.builder.position_at_end(bb);
2588        }
2589        self.let_slots.insert(idx, (ptr, ty));
2590        Ok(ptr)
2591    }
2592
2593    // -- entry point ----------------------------------------------------
2594
2595    pub(crate) fn lower_body(&mut self, body: &[TaggedOp]) -> Result<(), LlvmError> {
2596        for (ip, tagged) in body.iter().enumerate() {
2597            self.lower_op(ip, tagged)?;
2598        }
2599        // After `Op::Return` we positioned at a fresh "after_return_cont"
2600        // block which is dead and unterminated. Seal it with
2601        // `unreachable` so LLVM's verifier accepts the module. Same
2602        // pattern applies to the post-`Br` continuation block.
2603        if let Some(cur) = self.builder.get_insert_block() {
2604            if cur.get_terminator().is_none() {
2605                self.builder
2606                    .build_unreachable()
2607                    .map_err(|e| LlvmError::Codegen(format!("trailing unreachable: {e}")))?;
2608            }
2609        }
2610        Ok(())
2611    }
2612
2613    // -- per-op lowering ------------------------------------------------
2614
2615    pub(crate) fn lower_op(&mut self, ip: usize, tagged: &TaggedOp) -> Result<(), LlvmError> {
2616        let ip_hint = format!("ip={ip} op={:?}", tagged.op);
2617        // Phase H const-needle fast path: capture (and clear) the
2618        // `Op::ConstString` peek-state at the very start of every
2619        // `lower_op` dispatch. The `Op::Call` arm consults `prev_const_string`
2620        // to decide between the inline byte-scan and the extern shim.
2621        // Every other arm leaves `self.last_const_string` at `None` —
2622        // the only re-populator is the `Op::ConstString` arm at its
2623        // tail. Result: `prev_const_string.is_some()` iff the prior
2624        // emitted op was `Op::ConstString` and its value is still the
2625        // top-of-stack (no intervening op consumed it).
2626        let prev_const_string = self.last_const_string.take();
2627        match &tagged.op {
2628            // ---- literals ----
2629            Op::ConstI64(v) => {
2630                let c = self.ctx.i64_type().const_int(*v as u64, true);
2631                self.push(c, IrType::I64);
2632            }
2633            Op::ConstI32(v) => {
2634                let c = self.ctx.i32_type().const_int(*v as u32 as u64, false);
2635                self.push(c, IrType::I32);
2636            }
2637            Op::ConstBool(b) => {
2638                // Bool occupies an i32 slot on the IR's virtual stack.
2639                let c = self.ctx.i32_type().const_int(u64::from(*b), false);
2640                self.push(c, IrType::Bool);
2641            }
2642            Op::ConstF64(v) => {
2643                // AOT-1: materialise the `double` literal then bit-cast
2644                // to i64 so the operand stack stays integer-typed
2645                // (Option B). `v` is an `OrderedFloat<f64>`.
2646                let f = self.ctx.f64_type().const_float(v.into_inner());
2647                let bits = self
2648                    .builder
2649                    .build_bit_cast(f, self.ctx.i64_type(), &self.next_name("constf64_bits"))
2650                    .map_err(|e| LlvmError::Codegen(format!("ConstF64 bitcast: {e}")))?
2651                    .into_int_value();
2652                self.push(bits, IrType::F64);
2653            }
2654
2655            // ---- locals / lets ----
2656            Op::LocalGet(idx) => {
2657                // Phase E.1: an active inline frame redirects
2658                // `LocalGet(i)` to the inlined call's `i`-th argument
2659                // instead of the entry-function's LLVM params.
2660                if let Some(frame) = self.inline_frames.last() {
2661                    let i = *idx as usize;
2662                    let tv = frame.params.get(i).ok_or_else(|| {
2663                        LlvmError::Codegen(format!(
2664                            "inline LocalGet({idx}) out of range — callee has {} params",
2665                            frame.params.len()
2666                        ))
2667                    })?;
2668                    // Preserve provenance across the inline-frame argument
2669                    // bind. The bundled `list_int_filter` body reads its
2670                    // closure parameter via `LocalGet(1)`; when the caller
2671                    // passed a literal `MakeClosure` (a `KnownClosure`
2672                    // handle), forwarding that provenance lets the body's
2673                    // per-element `CallClosure` devirtualise into a direct
2674                    // call. Only `KnownClosure` is propagated here — the
2675                    // self-recursion / fast-path-entry tags depend on the
2676                    // current function's `captures_ptr_param` / fast-path
2677                    // state, which a *callee* inline frame does not share,
2678                    // so forwarding those would be unsound.
2679                    let (val, prov) = (tv.val, tv.prov);
2680                    match prov {
2681                        Provenance::KnownClosure { .. } => {
2682                            self.push_with_prov(val, tv.ty, prov);
2683                        }
2684                        _ => self.push(val, tv.ty),
2685                    }
2686                } else {
2687                    let p = self.lookup_param(*idx)?;
2688                    // The legacy envelope walks all-i64; the buffer envelope
2689                    // walks (i32 ×4, i64). The IR has the right type on
2690                    // the param descriptor, but we don't carry it through
2691                    // LocalGet — re-derive from the LLVM param width.
2692                    let width = p.get_type().get_bit_width();
2693                    let ty = if width == 32 {
2694                        IrType::I32
2695                    } else {
2696                        IrType::I64
2697                    };
2698                    // Phase F.W7 self-recursion fast path: tag
2699                    // `LocalGet(0)` inside a lambda body with
2700                    // [`Provenance::OwnCapturesPtr`] so the prologue
2701                    // capture-load chain can stamp
2702                    // [`Provenance::OwnCaptureHandle`] on self-
2703                    // recursive handles. Only fires inside a lambda
2704                    // (param_base == 1 means the LLVM param 0 is
2705                    // `*state` and param 1 is the captures_ptr arg);
2706                    // the entry / helpers leave provenance at
2707                    // `None`.
2708                    if *idx == 0 && self.captures_ptr_param.is_some() {
2709                        self.push_with_prov(p, ty, Provenance::OwnCapturesPtr);
2710                    } else {
2711                        self.push(p, ty);
2712                    }
2713                }
2714            }
2715            Op::LetSet { idx, ty } => {
2716                let v = self.pop(&ip_hint)?;
2717                let mapped = self.remap_let_idx(*idx);
2718                let slot = self.ensure_let_slot(mapped, *ty)?;
2719                // Coerce on bool / null where the producer pushed an i32
2720                // slot but the let-slot was declared as the canonical
2721                // 32-bit width.
2722                let stored = self.coerce_to_let_ty(v, *ty)?;
2723                self.builder
2724                    .build_store(slot, stored)
2725                    .map_err(|e| LlvmError::Codegen(format!("LetSet store: {e}")))?;
2726                // Phase F.W7 self-recursion fast path: when storing a
2727                // closure handle whose provenance points back at the
2728                // enclosing lambda, remember the let-slot so a later
2729                // `LetGet` resurrects the same provenance. This is
2730                // what bridges the prologue's capture-load chain
2731                // (`LocalGet(0); LoadI32AtAbsolute { offset }; LetSet
2732                // { idx, Closure }`) and the recursive call site
2733                // (`LetGet { idx, Closure }; ...; CallClosure`).
2734                if let Provenance::OwnCaptureHandle {
2735                    offset,
2736                    self_fn_table_idx,
2737                } = v.prov
2738                {
2739                    if matches!(*ty, IrType::Closure) {
2740                        self.self_capture_let_slots
2741                            .insert(mapped, (offset, self_fn_table_idx));
2742                    }
2743                }
2744                // Phase D.2 fast-path entry: when storing a virtualised
2745                // closure produced by an in-body `MakeClosure` (no
2746                // arena/state available), remember the `fn_table_idx`
2747                // so the matching `LetGet` re-emits the provenance and
2748                // the downstream `CallClosure` can rewrite into a
2749                // direct call.
2750                if let Provenance::FastPathClosure { fn_table_idx } = v.prov {
2751                    if matches!(*ty, IrType::Closure) {
2752                        self.fast_path_closure_let_slots
2753                            .insert(mapped, fn_table_idx);
2754                    }
2755                }
2756                // Devirtualisation (W18): propagate `KnownClosure`
2757                // across the `LetSet` → `LetGet` chain so a closure
2758                // handle stored into a let then read back at a
2759                // `CallClosure` site keeps its compile-time
2760                // `fn_table_idx`. A `LetSet { Closure }` of any *other*
2761                // provenance overwrites the slot with a value we cannot
2762                // prove is the same single closure, so drop the entry —
2763                // a later `LetGet` then falls back to the runtime
2764                // switch. This invalidation is what keeps a slot that is
2765                // reassigned to a dynamically-chosen closure correct.
2766                match (v.prov, *ty) {
2767                    (Provenance::KnownClosure { fn_table_idx }, IrType::Closure) => {
2768                        self.known_closure_let_slots.insert(mapped, fn_table_idx);
2769                    }
2770                    (_, IrType::Closure) => {
2771                        self.known_closure_let_slots.remove(&mapped);
2772                    }
2773                    _ => {}
2774                }
2775                // Phase L W3: propagate `Provenance::ConstString`
2776                // across the `LetSet` → `LetGet` chain so the reduce
2777                // closure's `s` (set every iteration from the same
2778                // const literal "a" in the W3 source) can be picked
2779                // up by `Op::Add(String)` as a const-len operand.
2780                // Any non-const-string `LetSet` against the same idx
2781                // wipes the entry below.
2782                match (v.prov, *ty) {
2783                    (Provenance::ConstString { len, first_byte }, IrType::String) => {
2784                        self.const_string_let_slots
2785                            .insert(mapped, (len, first_byte));
2786                    }
2787                    (_, IrType::String) => {
2788                        // A non-const value just overwrote the slot —
2789                        // drop any stale const-string record so a
2790                        // later `LetGet` cannot fraudulently claim
2791                        // const-len status.
2792                        self.const_string_let_slots.remove(&mapped);
2793                    }
2794                    _ => {}
2795                }
2796            }
2797            Op::LetGet { idx, ty } => {
2798                // Phase E.1: remap the callee's let-idx against the
2799                // active inline frame so concurrent stdlib inlines
2800                // don't clash on slot numbers.
2801                let mapped = self.remap_let_idx(*idx);
2802                let slot = self.ensure_let_slot(mapped, *ty)?;
2803                let llvm_ty: inkwell::types::BasicTypeEnum<'ctx> = match *ty {
2804                    // AOT-1: F64 rides as i64 bits, so its let-slot loads
2805                    // back as an i64 (the raw bit pattern, reinterpreted
2806                    // as `double` only at the arithmetic / store site).
2807                    IrType::I64 | IrType::F64 => self.ctx.i64_type().into(),
2808                    IrType::I32
2809                    | IrType::Bool
2810                    | IrType::Unit
2811                    | IrType::String
2812                    | IrType::ListInt
2813                    | IrType::ListFloat
2814                    | IrType::ListBool
2815                    | IrType::ListString
2816                    | IrType::ListSchema
2817                    | IrType::ListList
2818                    | IrType::Closure
2819                    | IrType::Dict => self.ctx.i32_type().into(),
2820                };
2821                let name = self.next_name("letget");
2822                let v = self
2823                    .builder
2824                    .build_load(llvm_ty, slot, &name)
2825                    .map_err(|e| LlvmError::Codegen(format!("LetGet load: {e}")))?
2826                    .into_int_value();
2827                // Phase F.W7 self-recursion fast path: when the let-slot
2828                // was populated by the lambda prologue's self-capture
2829                // load chain, re-stamp the matching
2830                // [`Provenance::OwnCaptureHandle`] so the recursive
2831                // call site (which reads the closure handle via
2832                // `LetGet`) keeps the fast-path tag alive.
2833                if matches!(*ty, IrType::Closure) {
2834                    if let Some(&(offset, self_fn_table_idx)) =
2835                        self.self_capture_let_slots.get(&mapped)
2836                    {
2837                        self.push_with_prov(
2838                            v,
2839                            *ty,
2840                            Provenance::OwnCaptureHandle {
2841                                offset,
2842                                self_fn_table_idx,
2843                            },
2844                        );
2845                    } else if let Some(&fn_table_idx) =
2846                        self.fast_path_closure_let_slots.get(&mapped)
2847                    {
2848                        // Phase D.2 fast-path entry: re-stamp the
2849                        // virtualised-closure tag so the matching
2850                        // `CallClosure` keeps the direct-call rewrite
2851                        // available.
2852                        self.push_with_prov(v, *ty, Provenance::FastPathClosure { fn_table_idx });
2853                    } else if let Some(&fn_table_idx) = self.known_closure_let_slots.get(&mapped) {
2854                        // Devirtualisation (W18): re-stamp `KnownClosure`
2855                        // so a `CallClosure` reading this handle through
2856                        // the let chain emits a direct call (still
2857                        // loading the real captures_ptr) instead of the
2858                        // runtime switch.
2859                        self.push_with_prov(v, *ty, Provenance::KnownClosure { fn_table_idx });
2860                    } else {
2861                        self.push(v, *ty);
2862                    }
2863                } else if matches!(*ty, IrType::String) {
2864                    // Phase L W3: re-stamp `Provenance::ConstString`
2865                    // when the let-slot is known to hold a value
2866                    // sourced from `Op::ConstString`. Crucial for the
2867                    // reduce closure's `s` operand — the iter-body
2868                    // sets `s` from a const literal then `LetGet`s it
2869                    // into the `Op::Add(String)` rhs, so without
2870                    // propagation the const-len fast path can never
2871                    // fire across the let chain.
2872                    if let Some(&(len, first_byte)) = self.const_string_let_slots.get(&mapped) {
2873                        self.push_with_prov(v, *ty, Provenance::ConstString { len, first_byte });
2874                    } else {
2875                        self.push(v, *ty);
2876                    }
2877                } else {
2878                    self.push(v, *ty);
2879                }
2880            }
2881
2882            // ---- arithmetic ----
2883            Op::Add(ty) => match ty {
2884                // Phase E.1: `Op::Add(IrType::String)` is the
2885                // pair-wise String + String form (the StrConcatN
2886                // fold only fires for compile-time-known chains —
2887                // `reduce("", (acc, s) => acc + s)` lowers to a
2888                // per-iter `Add(String)`).
2889                //
2890                // Phase I (W3 string-concat gap close): emit the
2891                // in-place-append fast path. The W3 reduce hot loop
2892                // walks `acc = acc + "a"` for N iters; under the
2893                // historical inlined-`concat` body that turned into
2894                // an O(N²) byte-copy storm because every iter
2895                // reallocated a fresh scratch record. The new
2896                // helper recognises the "lhs is the most recent
2897                // scratch alloc" case at runtime and extends the
2898                // record in place — total work drops to O(N) bytes,
2899                // matching `String::push_str`. The slow path stays
2900                // bit-identical with the historical lowering so
2901                // mixed-source string adds (const-pool literals,
2902                // out-of-order scratch records) still produce a
2903                // fresh record.
2904                IrType::String => self.emit_str_add_inplace_or_concat(&ip_hint)?,
2905                _ => self.emit_binop(&ip_hint, *ty, BinOp::Add)?,
2906            },
2907            Op::Sub(ty) => self.emit_binop(&ip_hint, *ty, BinOp::Sub)?,
2908            Op::Mul(ty) => self.emit_binop(&ip_hint, *ty, BinOp::Mul)?,
2909            Op::Div(ty) => self.emit_binop(&ip_hint, *ty, BinOp::Div)?,
2910            Op::Mod(ty) => self.emit_binop(&ip_hint, *ty, BinOp::Mod)?,
2911            Op::BitAnd(ty) => self.emit_binop(&ip_hint, *ty, BinOp::BitAnd)?,
2912            Op::ConvertI64ToF64 => self.emit_convert_i64_to_f64(&ip_hint)?,
2913            Op::F64ToI64Sat => self.emit_f64_to_i64_sat(&ip_hint)?,
2914            Op::F64Unary(op) => self.emit_f64_unary(&ip_hint, *op)?,
2915            Op::F64Pow => self.emit_f64_pow(&ip_hint)?,
2916
2917            // ---- comparisons ----
2918            Op::Eq(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::EQ)?,
2919            Op::Ne(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::NE)?,
2920            Op::Lt(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::SLT)?,
2921            Op::Le(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::SLE)?,
2922            Op::Gt(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::SGT)?,
2923            Op::Ge(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::SGE)?,
2924
2925            // ---- buffer-protocol I/O ----
2926            Op::LoadField { offset, ty } => self.emit_load_field(*offset, *ty)?,
2927            Op::StoreField {
2928                offset,
2929                ty,
2930                inplace,
2931            } => self.emit_store_field(&ip_hint, *offset, *ty, *inplace)?,
2932
2933            // ---- pointer-indirect param loads (Phase 2 relon-rs surface) ----
2934            // String / List* `#main` parameters arrive in the input
2935            // buffer as a 4-byte buffer-relative offset to a tail
2936            // record. The IR's lowering pass emits `Op::LoadStringPtr`
2937            // (and its List* siblings) instead of `Op::LoadField {
2938            // ty: String }` so the dispatch stays unambiguous; we
2939            // share the same `emit_load_pointer_indirect_param` impl
2940            // for all variants.
2941            Op::LoadStringPtr { offset } => {
2942                self.emit_load_pointer_indirect_param(*offset, IrType::String)?
2943            }
2944            Op::LoadListIntPtr { offset } => {
2945                self.emit_load_pointer_indirect_param(*offset, IrType::ListInt)?
2946            }
2947            Op::LoadListFloatPtr { offset } => {
2948                self.emit_load_pointer_indirect_param(*offset, IrType::ListFloat)?
2949            }
2950            Op::LoadListBoolPtr { offset } => {
2951                self.emit_load_pointer_indirect_param(*offset, IrType::ListBool)?
2952            }
2953            Op::LoadListStringPtr { offset } => {
2954                self.emit_load_pointer_indirect_param(*offset, IrType::ListString)?
2955            }
2956            Op::LoadListSchemaPtr { offset } => {
2957                self.emit_load_pointer_indirect_param(*offset, IrType::ListSchema)?
2958            }
2959            Op::LoadListListPtr { offset } => {
2960                self.emit_load_pointer_indirect_param(*offset, IrType::ListList)?
2961            }
2962
2963            // ---- ReadStringLen (Phase 2 — backs `length(s)` / `len(xs)`) ----
2964            // Pop arena-relative i32 record pointer, load the leading
2965            // 4-byte length prefix, zext to i64 and push. Used by the
2966            // bundled stdlib `length` (String) / `list_*_length` bodies
2967            // — every list record shares the `[len: u32 LE]` prefix
2968            // with String, so a single lowering covers both.
2969            Op::ReadStringLen => self.emit_read_string_len(&ip_hint)?,
2970
2971            // ---- control flow ----
2972            Op::Block { result_ty, body } => self.emit_block(*result_ty, body)?,
2973            Op::Loop { result_ty, body } => self.emit_loop(*result_ty, body)?,
2974            Op::Br { label_depth } => self.emit_br(*label_depth)?,
2975            Op::BrIf { label_depth } => self.emit_br_if(&ip_hint, *label_depth)?,
2976            Op::If {
2977                result_ty,
2978                then_body,
2979                else_body,
2980            } => self.emit_if(&ip_hint, *result_ty, then_body, else_body)?,
2981
2982            // ---- return ----
2983            Op::Return => self.emit_return(&ip_hint)?,
2984
2985            // ---- Phase E.1: const-data pool ----
2986            Op::ConstString { idx, value } => {
2987                let off = self
2988                    .const_pool
2989                    .string_offsets
2990                    .get(idx)
2991                    .copied()
2992                    .ok_or_else(|| {
2993                        LlvmError::Codegen(format!(
2994                            "Op::ConstString {{ idx: {idx} }}: missing const-pool entry — \
2995                         did the host forget to lay out the pool blob before dispatch?"
2996                        ))
2997                    })?;
2998                let c = self.ctx.i32_type().const_int(u64::from(off), false);
2999                // Phase L W3: stamp const-len provenance so the
3000                // downstream `Op::Add(String)` lowering (via
3001                // `emit_str_add_inplace_or_concat`) can use the
3002                // compile-time-known length to elide the per-iter
3003                // `[len]` header reload and replace the rhs memcpy
3004                // with a single byte store when the literal is one
3005                // byte (the dominant cmp_lua W3 reduce shape). The
3006                // provenance only survives across `LetSet`/`LetGet`
3007                // for `IrType::String` (tracked in
3008                // `const_string_let_slots`) so non-String consumers
3009                // never observe it.
3010                let bytes = value.as_bytes();
3011                let len_u32 = u32::try_from(bytes.len()).map_err(|_| {
3012                    LlvmError::Codegen("ConstString length exceeds u32 range".into())
3013                })?;
3014                let first_byte = if bytes.len() == 1 {
3015                    Some(bytes[0])
3016                } else {
3017                    None
3018                };
3019                self.push_with_prov(
3020                    c,
3021                    IrType::String,
3022                    Provenance::ConstString {
3023                        len: len_u32,
3024                        first_byte,
3025                    },
3026                );
3027                // Phase H peek-state: record the literal bytes so the
3028                // next `lower_op` call can detect `Op::Call(contains)`
3029                // with this string still at top-of-stack and switch
3030                // to the inline byte-scan instead of the extern shim.
3031                // Cleared at the start of every `lower_op` — see the
3032                // `prev_const_string.take()` line at the dispatch
3033                // head — so a single intervening op (Push / Pop /
3034                // Add / ...) drops the optimisation cleanly.
3035                self.last_const_string = Some(bytes.to_vec());
3036            }
3037
3038            // ---- Phase E.1: raw-memory primitives ----
3039            Op::LoadI32AtAbsolute { offset } => {
3040                // Phase F.W7 self-recursion fast path: when the base
3041                // (top-of-stack at this point) is the lambda's own
3042                // captures_ptr arg and the offset matches a recorded
3043                // self-recursive capture slot, the result is a
3044                // closure handle whose backing struct points back at
3045                // the enclosing lambda. Stash the provenance hint
3046                // so the downstream `LetSet/LetGet/CallClosure` chain
3047                // can short-circuit the indirect dispatch. The
3048                // sniff peeks at the stack-top without mutating it;
3049                // the actual load still flows through
3050                // `emit_load_at_absolute` so we don't fork the
3051                // raw-memory primitive's lowering.
3052                let prov_hint = self.peek_self_capture_provenance(*offset);
3053                self.emit_load_at_absolute(&ip_hint, *offset, AbsLoad::I32)?;
3054                if let Some(prov) = prov_hint {
3055                    if let Some(top) = self.stack.last_mut() {
3056                        top.prov = prov;
3057                    }
3058                }
3059            }
3060            Op::LoadI64AtAbsolute { offset } => {
3061                self.emit_load_at_absolute(&ip_hint, *offset, AbsLoad::I64)?
3062            }
3063            Op::LoadI8UAtAbsolute { offset } => {
3064                self.emit_load_at_absolute(&ip_hint, *offset, AbsLoad::I8U)?
3065            }
3066            Op::LoadF64AtAbsolute { offset } => {
3067                self.emit_load_at_absolute(&ip_hint, *offset, AbsLoad::F64)?
3068            }
3069            Op::StoreI32AtAbsolute { offset } => {
3070                self.emit_store_at_absolute(&ip_hint, *offset, AbsStore::I32)?
3071            }
3072            Op::StoreI64AtAbsolute { offset } => {
3073                self.emit_store_at_absolute(&ip_hint, *offset, AbsStore::I64)?
3074            }
3075            Op::StoreI8AtAbsolute { offset } => {
3076                self.emit_store_at_absolute(&ip_hint, *offset, AbsStore::I8)?
3077            }
3078            Op::StoreF64AtAbsolute { offset } => {
3079                self.emit_store_at_absolute(&ip_hint, *offset, AbsStore::F64)?
3080            }
3081            Op::MemcpyAtAbsolute => self.emit_memcpy_at_absolute(&ip_hint)?,
3082            Op::AllocScratch { size_bytes } => self.emit_alloc_scratch_static(*size_bytes)?,
3083            Op::AllocScratchDyn => self.emit_alloc_scratch_dyn(&ip_hint)?,
3084            Op::StrConcatN { operand_count } => self.emit_str_concat_n(&ip_hint, *operand_count)?,
3085            Op::IntToStr => self.emit_int_to_str(&ip_hint)?,
3086            Op::FloatToStr => self.emit_float_to_str(&ip_hint)?,
3087
3088            // ---- Phase E.1 + E.2 call dispatch ----
3089            // stdlib indices (#278) route through the bundled-body
3090            // inline path (`emit_call_stdlib`); user-defined indices
3091            // (#279) resolve through the helper table populated by
3092            // `emit_module_funcs`.
3093            Op::Call {
3094                fn_index,
3095                arg_count,
3096                param_tys,
3097                ret_ty,
3098            } => {
3099                let stdlib_count = relon_ir::stdlib::stdlib_function_count();
3100                // Phase F.1: `contains(haystack, needle) -> Bool` short-
3101                // circuit. The bundled stdlib body is a hand-transcribed
3102                // O(s_len * p_len) byte scan that defeats LLVM's auto-
3103                // vectoriser on the inner compare loop (every iter
3104                // reloads the needle bytes through a let-slot). On the
3105                // W4 / W4_long cmp_lua rows that turns into a 3.4× /
3106                // 256× gap vs LuaJIT (which uses SIMD-accelerated
3107                // `string.find`). Route the call through the host shim
3108                // `relon_llvm_str_contains_arena` which defers to
3109                // `core::str::contains` — std's substring search backs
3110                // single-byte needles with SIMD `memchr` and uses a
3111                // Two-Way matcher for longer needles, closing the gap
3112                // without inventing a Relon-specific SIMD path.
3113                if *fn_index < stdlib_count
3114                    && relon_ir::stdlib::stdlib_function_index("contains") == Some(*fn_index)
3115                    && *arg_count == 2
3116                    && param_tys == &[IrType::String, IrType::String]
3117                    && *ret_ty == IrType::Bool
3118                {
3119                    // Phase H: when the needle was pushed by the
3120                    // immediately-preceding `Op::ConstString` (peek
3121                    // state populated at `lower_op` head), inline a
3122                    // tight byte-scan against the literal bytes.
3123                    // Skips the `relon_llvm_str_contains_arena` FFI
3124                    // boundary entirely — ~10-15 cycles of prologue /
3125                    // epilogue / IC atomic loads per call. The W4 /
3126                    // W4_long hot loops always hit this path (needle
3127                    // = `"x"` literal); dynamic-needle callers (e.g.
3128                    // `filter((s) => s.contains(other))` where
3129                    // `other` flows in via an outer let-slot) fall
3130                    // through to the existing Phase G extern shim.
3131                    if let Some(needle_bytes) = prev_const_string.as_deref() {
3132                        self.emit_str_contains_const_needle(&ip_hint, needle_bytes)?;
3133                    } else {
3134                        self.emit_str_contains_extern(&ip_hint)?;
3135                    }
3136                } else if *fn_index < stdlib_count {
3137                    self.emit_call_stdlib(&ip_hint, *fn_index, *arg_count, param_tys, *ret_ty)?
3138                } else {
3139                    self.emit_call(&ip_hint, *fn_index, *arg_count, param_tys, *ret_ty)?
3140                }
3141            }
3142
3143            // ---- Phase F.W7: anon-Dict-return record ops ----
3144            // The IR lowering pass uses `AllocRootRecord` to bind a
3145            // per-record-local i32 alloca to `0` (the root sits at
3146            // `out_ptr + 0`); subsequent `StoreFieldAtRecord` ops use
3147            // the alloca-resident offset to compute the destination
3148            // address in the output buffer's fixed area.
3149            Op::AllocRootRecord { record_local_idx } => {
3150                self.emit_alloc_root_record(*record_local_idx)?
3151            }
3152            Op::StoreFieldAtRecord {
3153                record_local_idx,
3154                offset,
3155                ty,
3156            } => self.emit_store_field_at_record(&ip_hint, *record_local_idx, *offset, *ty)?,
3157
3158            // ---- Phase F.W7: closure-as-value primitives ----
3159            Op::MakeClosure {
3160                fn_table_idx,
3161                captures,
3162                captures_size,
3163            } => self.emit_make_closure(&ip_hint, *fn_table_idx, captures, *captures_size)?,
3164            Op::CallClosure { param_tys, ret_ty } => {
3165                self.emit_call_closure(&ip_hint, param_tys, *ret_ty)?
3166            }
3167
3168            // ---- Phase 0b family seams ----
3169            // The ops below are not yet lowered by the LLVM AOT backend.
3170            // They are listed EXPLICITLY (no `_ =>` wildcard) so that
3171            // adding a new `Op` variant fails to compile here — forcing a
3172            // deliberate decision instead of a silent runtime codegen
3173            // error. Each group delegates to a thin per-family entry
3174            // point living in that family's `codegen/<family>.rs` file,
3175            // so Phase 0b agents fill one family file each WITHOUT
3176            // touching this shared dispatch (zero merge conflicts). The
3177            // stubs return the same `LlvmError::Codegen` the catch-all
3178            // used to, so today's fallback behaviour is unchanged.
3179
3180            // collections.rs — list/dict/sub-record construction
3181            Op::ConstListInt { .. }
3182            | Op::ConstListFloat { .. }
3183            | Op::ConstListBool { .. }
3184            | Op::ConstListString { .. }
3185            | Op::ConstDict { .. }
3186            | Op::DictGetByStringKey { .. }
3187            | Op::ListGetByIntIdx { .. }
3188            | Op::AllocSubRecord { .. }
3189            | Op::AllocScratchRecord { .. }
3190            | Op::PushRecordBase { .. }
3191            | Op::PushRecordBaseAbsolute { .. }
3192            | Op::StoreFieldAtRecordAbsolute { .. }
3193            | Op::EmitTailRecordFromAbsoluteAddr { .. }
3194            | Op::BuildVariantRecord { .. }
3195            | Op::BuildVariantRecordScratch { .. }
3196            | Op::BuildPointerList { .. } => {
3197                self.lower_collections_rest(ip, &ip_hint, &tagged.op)?
3198            }
3199
3200            // control.rs — multi-way / select control flow
3201            Op::Select { .. } | Op::BrTable { .. } => {
3202                self.lower_control_rest(ip, &ip_hint, &tagged.op)?
3203            }
3204
3205            // mem.rs — absolute-addressed field load
3206            Op::LoadFieldAtAbsolute { .. } => self.lower_mem_rest(ip, &ip_hint, &tagged.op)?,
3207
3208            // call.rs — native dispatch + capability gate + trap
3209            Op::CallNative { .. } | Op::CheckCap { .. } | Op::Trap { .. } => {
3210                self.lower_call_rest(ip, &ip_hint, &tagged.op)?
3211            }
3212
3213            // schema.rs — schema pointer / method dispatch
3214            Op::LoadSchemaPtr { .. } => self.lower_schema_rest(ip, &ip_hint, &tagged.op)?,
3215
3216            // unicode.rs — *TableAddr long tail
3217            Op::CaseFoldTableAddr { .. }
3218            | Op::CombiningMarkRangesAddr
3219            | Op::WhitespaceRangesAddr
3220            | Op::DecompTableAddr { .. }
3221            | Op::CccTableAddr
3222            | Op::CompositionTableAddr
3223            | Op::FullCaseFoldTableAddr { .. }
3224            | Op::CasedRangesAddr
3225            | Op::CaseIgnorableRangesAddr
3226            | Op::TurkishCaseFoldTableAddr { .. } => {
3227                self.lower_unicode_rest(ip, &ip_hint, &tagged.op)?
3228            }
3229        }
3230        Ok(())
3231    }
3232
3233    // -- Phase E.1: inline-call frame helpers --------------------------
3234
3235    /// Translate a callee `LetGet/LetSet` index against the topmost
3236    /// inline frame. Mirrors cranelift's `remap_let_idx`.
3237    pub(crate) fn remap_let_idx(&self, idx: u32) -> u32 {
3238        match self.inline_frames.last() {
3239            Some(frame) => frame.let_offset.saturating_add(idx),
3240            None => idx,
3241        }
3242    }
3243
3244    // -- helpers --------------------------------------------------------
3245
3246    pub(crate) fn coerce_to_let_ty(
3247        &self,
3248        tv: TypedValue<'ctx>,
3249        target: IrType,
3250    ) -> Result<BasicValueEnum<'ctx>, LlvmError> {
3251        let want_width = match target {
3252            // AOT-1: F64 rides as i64 bits, so its let-slot is 64-wide
3253            // (same as I64). Coercion stays a width match — never an
3254            // int<->float cast — because the stack value is the raw
3255            // bit pattern, not a `double`.
3256            IrType::I64 | IrType::F64 => 64,
3257            IrType::I32
3258            | IrType::Bool
3259            | IrType::Unit
3260            | IrType::String
3261            | IrType::ListInt
3262            | IrType::ListFloat
3263            | IrType::ListBool
3264            | IrType::ListString
3265            | IrType::ListSchema
3266            | IrType::ListList
3267            | IrType::Closure
3268            | IrType::Dict => 32,
3269        };
3270        let have_width = tv.val.get_type().get_bit_width();
3271        if have_width == want_width {
3272            return Ok(tv.val.into());
3273        }
3274        let target_ty = if want_width == 64 {
3275            self.ctx.i64_type()
3276        } else {
3277            self.ctx.i32_type()
3278        };
3279        if have_width < want_width {
3280            self.builder
3281                .build_int_z_extend(tv.val, target_ty, "let_zext")
3282                .map(|v| v.as_basic_value_enum())
3283                .map_err(|e| LlvmError::Codegen(format!("let zext: {e}")))
3284        } else {
3285            self.builder
3286                .build_int_truncate(tv.val, target_ty, "let_trunc")
3287                .map(|v| v.as_basic_value_enum())
3288                .map_err(|e| LlvmError::Codegen(format!("let trunc: {e}")))
3289        }
3290    }
3291
3292    // -- control flow ---------------------------------------------------
3293}
3294
/// Inline lookup table used by `emit_load_field`. Picks the LLVM
/// integer type + the IR tag we push back onto the operand stack
/// for a Phase-B-supported scalar field type.
//
// NOTE(review): this `impl` block is empty, so the description above
// does not match any item defined here. Presumably the lookup was
// moved to another file and this shell was left behind — confirm,
// then delete either the stale doc comment or the whole block.
impl<'ctx, 'b, 'cp> Emit<'ctx, 'b, 'cp> {}
3299
3300// ---------------------------------------------------------------------------
3301// Phase E.1: raw-memory primitives, scratch allocator, StrConcatN.
3302// ---------------------------------------------------------------------------
3303
3304impl<'ctx, 'b, 'cp> Emit<'ctx, 'b, 'cp> {
3305    /// Map an `IrType` to the LLVM int type used for the operand stack
3306    /// representation. Used by `Op::MakeClosure` capture reads and
3307    /// `Op::CallClosure` return loads.
3308    pub(crate) fn ir_ty_to_llvm_int(
3309        &self,
3310        ty: IrType,
3311    ) -> Result<inkwell::types::IntType<'ctx>, LlvmError> {
3312        match ty {
3313            IrType::I64 | IrType::F64 => Ok(self.ctx.i64_type()),
3314            IrType::I32
3315            | IrType::Bool
3316            | IrType::Unit
3317            | IrType::String
3318            | IrType::ListInt
3319            | IrType::ListFloat
3320            | IrType::ListBool
3321            | IrType::ListString
3322            | IrType::ListSchema
3323            | IrType::ListList
3324            | IrType::Closure
3325            | IrType::Dict => Ok(self.ctx.i32_type()),
3326        }
3327    }
3328}
3329
#[cfg(test)]
mod const_pool_tests {
    //! Byte-for-byte layout pins for the `ConstList*` records in the
    //! LLVM `ConstPool`.
    //!
    //! The record bytes are a cross-backend arena data contract: what
    //! this crate lays out for `ConstListInt` / `ConstListFloat` /
    //! `ConstListBool` must be identical to what
    //! `relon_codegen_cranelift`'s `ConstPool::visit_const_list_*`
    //! produces, because both backends copy the same blob into the
    //! arena prefix and a drift on either side silently corrupts the
    //! other's cached ET_REL. Both `ConstPool`s are crate-private, so
    //! parity is pinned here against the documented wire layout that
    //! the cranelift `visit_const_list_*` port was matched to:
    //!
    //! * int / float: align 8, `[len: u32 LE][pad: u32 zero][i64/f64 LE]`
    //! * bool:        align 4, `[len: u32 LE][u8 0/1 tightly packed]`
    use super::*;
    use relon_ir::ir::{Func, Op, TaggedOp};
    use relon_parser::TokenRange;

    /// Wrap `op` in a `TaggedOp` carrying a default source range.
    fn tagged(op: Op) -> TaggedOp {
        let range = TokenRange::default();
        TaggedOp { op, range }
    }

    /// Build a module holding a single parameterless `run_main`
    /// function (returning `I64`) whose body is `body`, registered as
    /// the entry function.
    fn synth_module(body: Vec<TaggedOp>) -> IrModule {
        let run_main = Func {
            name: "run_main".into(),
            params: Vec::new(),
            ret: IrType::I64,
            body,
            range: TokenRange::default(),
        };
        IrModule {
            funcs: vec![run_main],
            entry_func_index: Some(0),
            imports: Vec::new(),
            closure_table: Vec::new(),
        }
    }

    #[test]
    fn const_list_int_byte_layout() {
        let module = synth_module(vec![tagged(Op::ConstListInt {
            idx: 0,
            elements: vec![10, 20, 30],
        })]);
        let pool = ConstPool::from_module(&module).unwrap();

        assert_eq!(pool.list_int_offsets.get(&0).copied(), Some(0));
        // Header: [len:u32=3][pad:4 zero]
        assert_eq!(&pool.bytes[..4], &3u32.to_le_bytes());
        assert_eq!(&pool.bytes[4..8], &[0u8; 4]);
        // Payload: three little-endian i64 values, 8 bytes apiece.
        for (slot, want) in [10i64, 20, 30].iter().enumerate() {
            let start = 8 + slot * 8;
            assert_eq!(&pool.bytes[start..start + 8], &want.to_le_bytes());
        }
        assert_eq!(pool.bytes.len(), 32);
    }

    #[test]
    fn const_list_float_byte_layout() {
        // The IR carries f64 elements as their u64 bit pattern
        // (`ConstListFloat { elements: Vec<u64> }`), so the expected
        // payload is simply those bits in little-endian order.
        let f0 = 1.5f64.to_bits();
        let f1 = (-2.0f64).to_bits();
        let module = synth_module(vec![tagged(Op::ConstListFloat {
            idx: 0,
            elements: vec![f0, f1],
        })]);
        let pool = ConstPool::from_module(&module).unwrap();

        assert_eq!(pool.list_float_offsets.get(&0).copied(), Some(0));
        assert_eq!(&pool.bytes[..4], &2u32.to_le_bytes());
        assert_eq!(&pool.bytes[4..8], &[0u8; 4]);
        for (slot, bits) in [f0, f1].iter().enumerate() {
            let start = 8 + slot * 8;
            assert_eq!(&pool.bytes[start..start + 8], &bits.to_le_bytes());
        }
        assert_eq!(pool.bytes.len(), 24);
    }

    #[test]
    fn const_list_bool_byte_layout() {
        let module = synth_module(vec![tagged(Op::ConstListBool {
            idx: 0,
            elements: vec![true, false, true],
        })]);
        let pool = ConstPool::from_module(&module).unwrap();

        assert_eq!(pool.list_bool_offsets.get(&0).copied(), Some(0));
        // [len:u32=3] followed by one byte per bool (1,0,1) with no
        // padding between the payload bytes.
        assert_eq!(&pool.bytes[..4], &3u32.to_le_bytes());
        assert_eq!(&pool.bytes[4..7], &[1u8, 0, 1]);
        assert_eq!(pool.bytes.len(), 7);
    }

    #[test]
    fn const_list_alignment_across_records() {
        // The bool record occupies 4 + 3 = 7 bytes. The int record
        // that follows needs its header on an 8-byte boundary so the
        // i64 payload ends up 8-aligned.
        let bool_list = tagged(Op::ConstListBool {
            idx: 0,
            elements: vec![true, false, true],
        });
        let int_list = tagged(Op::ConstListInt {
            idx: 1,
            elements: vec![42],
        });
        let module = synth_module(vec![bool_list, int_list]);
        let pool = ConstPool::from_module(&module).unwrap();

        assert_eq!(pool.list_bool_offsets.get(&0).copied(), Some(0));
        // 7 bytes used → align_to(8) pads to offset 8 for the int record.
        assert_eq!(pool.list_int_offsets.get(&1).copied(), Some(8));
        assert_eq!(&pool.bytes[8..12], &1u32.to_le_bytes());
        assert_eq!(&pool.bytes[16..24], &42i64.to_le_bytes());
    }

    #[test]
    fn const_list_string_byte_layout() {
        // W5-P2 pointer-array layout for elements "a", "bb", "ccc".
        // The String records come first, each 4-aligned:
        //   off 0:  [slen=1]["a"]    -> 5 bytes, next record at 8
        //   off 8:  [slen=2]["bb"]   -> 6 bytes, next record at 16
        //   off 16: [slen=3]["ccc"]  -> 7 bytes, header at 24
        // followed by the list header:
        //   off 24: [len=3][off_0=0][off_1=8][off_2=16]
        let module = synth_module(vec![tagged(Op::ConstListString {
            idx: 0,
            elements: vec!["a".into(), "bb".into(), "ccc".into()],
        })]);
        let pool = ConstPool::from_module(&module).unwrap();

        // Each String record: u32 LE length prefix, then the raw bytes.
        let records: [(usize, &str); 3] = [(0, "a"), (8, "bb"), (16, "ccc")];
        for (at, text) in records.iter() {
            let body = at + 4;
            assert_eq!(&pool.bytes[*at..body], &(text.len() as u32).to_le_bytes());
            assert_eq!(&pool.bytes[body..body + text.len()], text.as_bytes());
        }

        // List header at offset 24: element count, then one record
        // offset per element.
        assert_eq!(pool.list_string_offsets.get(&0).copied(), Some(24));
        for (word, want) in [3u32, 0, 8, 16].iter().enumerate() {
            let start = 24 + word * 4;
            assert_eq!(&pool.bytes[start..start + 4], &want.to_le_bytes());
        }
        assert_eq!(pool.bytes.len(), 40);
    }

    #[test]
    fn duplicate_const_list_idx_is_noop() {
        // The same `idx` appears twice with identical contents.
        let int_list = || {
            tagged(Op::ConstListInt {
                idx: 0,
                elements: vec![1, 2],
            })
        };
        let module = synth_module(vec![int_list(), int_list()]);
        let pool = ConstPool::from_module(&module).unwrap();
        // One record only: 8 header + 2*8 payload = 24.
        assert_eq!(pool.bytes.len(), 24);
    }
}
3495
#[cfg(test)]
mod devirt_tests {
    //! Soundness unit tests for the capture analysis behind W18
    //! closure devirtualisation. They drive the IR scan that decides
    //! which captures may be stamped `KnownClosure` (→ direct call)
    //! and which stay a genuinely-dynamic dispatch (→ runtime switch).
    //! A wrong answer is a silent miscompile, so the analysis is
    //! pinned here independently of any end-to-end source.
    use super::*;
    use relon_ir::ir::{ClosureCapture, Func, IrType, Op, TaggedOp};
    use relon_parser::TokenRange;

    /// Wrap `o` in a `TaggedOp` carrying a default source range.
    fn op(o: Op) -> TaggedOp {
        let range = TokenRange::default();
        TaggedOp { op: o, range }
    }

    /// Build an `Op::MakeClosure` whose `captures_size` is the largest
    /// capture offset plus 8, or 0 when there are no captures.
    fn make_closure(fn_table_idx: u32, captures: Vec<ClosureCapture>) -> Op {
        let captures_size = captures
            .iter()
            .map(|c| c.offset)
            .max()
            .map_or(0, |furthest| furthest + 8);
        Op::MakeClosure {
            fn_table_idx,
            captures,
            captures_size,
        }
    }

    /// A `Closure`-typed capture of let-slot `let_idx` stored at
    /// `offset`.
    fn cap(let_idx: u32, offset: u32) -> ClosureCapture {
        ClosureCapture {
            ty: IrType::Closure,
            let_idx,
            offset,
        }
    }

    /// A `run_main` function taking one `I32` and returning `I32`,
    /// with `body` as its op list.
    fn entry_with_body(body: Vec<TaggedOp>) -> Func {
        Func {
            name: "run_main".into(),
            params: vec![IrType::I32],
            ret: IrType::I32,
            range: TokenRange::default(),
            body,
        }
    }

    /// A capture of a *known, non-self* closure is recorded so the
    /// capturing lambda's body can devirtualise the call against it.
    /// Mirrors the W18 predicate `(k) => is_prime(k, 2)` capturing the
    /// `is_prime` closure (`fn_table_idx=0`).
    #[test]
    fn records_known_non_self_capture() {
        let bind_closure = |slot| {
            op(Op::LetSet {
                idx: slot,
                ty: IrType::Closure,
            })
        };
        // The call that consumes the predicate closure.
        let consumer_call = Op::Call {
            fn_index: 14,
            arg_count: 2,
            param_tys: vec![IrType::ListInt, IrType::Closure],
            ret_ty: IrType::ListInt,
        };
        // let0 := MakeClosure(K=0)  ; the `is_prime` binding
        // MakeClosure(L=1) capturing let0 at offset 0 ; the predicate
        let entry = entry_with_body(vec![
            op(make_closure(0, vec![cap(0, 0)])), // is_prime self-capture
            bind_closure(0),
            op(make_closure(1, vec![cap(0, 0)])), // predicate captures is_prime
            op(consumer_call),
        ]);
        let table = build_known_capture_table(&entry, &[], &[]);

        // Lambda L=1 (the predicate) captures known closure K=0 at
        // offset 0.
        let expected: &[(u32, u32)] = &[(0, 0)];
        assert_eq!(
            table.get(&1).map(|caps| caps.as_slice()),
            Some(expected),
            "predicate (L=1) must record its is_prime (K=0) capture as known"
        );
        // The capture made by L=0 (is_prime) is a SELF capture
        // (K==L==0). It belongs to the self-capture table, whose
        // captures_ptr-reuse direct path is strictly better, so it
        // must be absent here.
        assert!(
            table.get(&0).is_none(),
            "self-capture (K==L) must be excluded from the known-capture table"
        );
    }

    /// When a closure let-slot is reassigned to a value that is NOT a
    /// literal `MakeClosure` (a genuinely-dynamic closure), the capture
    /// must NOT be recorded — the body keeps the runtime switch. This is
    /// the correctness red line: devirtualise only a provably-unique
    /// callee.
    #[test]
    fn drops_reassigned_dynamic_closure_slot() {
        let bind_closure = |slot| {
            op(Op::LetSet {
                idx: slot,
                ty: IrType::Closure,
            })
        };
        // Reads a closure from a slot this body never wrote, i.e. one
        // whose origin is unprovable (a param, a phi, a different
        // binding).
        let load_unknown_closure = op(Op::LetGet {
            idx: 5,
            ty: IrType::Closure,
        });
        // let0 := MakeClosure(0)        ; known
        // let0 := <some other Closure>  ; reassigned, now dynamic
        // MakeClosure(2) capturing let0 ; must NOT be recorded
        let entry = entry_with_body(vec![
            op(make_closure(0, vec![cap(0, 0)])),
            bind_closure(0),
            // This `LetSet { Closure }` is NOT preceded by a
            // `MakeClosure`, so slot 0 stops being a known closure.
            load_unknown_closure,
            bind_closure(0),
            op(make_closure(2, vec![cap(0, 0)])),
            bind_closure(9),
        ]);
        let table = build_known_capture_table(&entry, &[], &[]);

        assert!(
            table.get(&2).is_none(),
            "a capture of a reassigned (dynamic) closure slot must NOT be \
             recorded — the call must keep the runtime switch"
        );
    }

    /// The binding `LetSet` that immediately follows a known
    /// `MakeClosure` must NOT clear the slot it just established (the
    /// ordering bug fixed during development). A later capture of that
    /// slot is still recorded.
    #[test]
    fn binding_letset_does_not_clear_its_own_slot() {
        let bind_closure = |slot| {
            op(Op::LetSet {
                idx: slot,
                ty: IrType::Closure,
            })
        };
        let entry = entry_with_body(vec![
            op(make_closure(3, Vec::new())),
            bind_closure(7),
            op(make_closure(4, vec![cap(7, 0)])),
            bind_closure(8),
        ]);
        let table = build_known_capture_table(&entry, &[], &[]);

        let expected: &[(u32, u32)] = &[(0, 3)];
        assert_eq!(
            table.get(&4).map(|caps| caps.as_slice()),
            Some(expected),
            "L=4 must record its capture of known closure K=3 at offset 0"
        );
    }
}