relon_codegen_llvm/codegen/mod.rs
1//! IR -> LLVM IR lowering.
2//!
3//! Phase B widens the emitter past the Phase A bootstrap envelope:
4//!
5//! - Two entry shapes:
6//! - **Legacy-i64**: `(I64...) -> I64` — driven by
7//! [`LlvmAotEvaluator::from_ir_direct`]. Mirrors the cranelift
8//! crate's same-named envelope; used by the Phase A bootstrap
9//! tests and the side-by-side `from_ir_direct` benchmarks.
10//! - **Buffer-protocol**: `(*state, i32 in_ptr, i32 in_len,
11//! i32 out_ptr, i32 out_cap, i64 caps) -> i32` — driven by
12//! [`LlvmAotEvaluator::from_source`]. Matches what
13//! `lower_workspace_single` emits for every user source.
14//!
15//! - Op set widened to the W1 / W2 production-source surface:
16//! `LocalGet`, `ConstI64` / `ConstI32` / `ConstBool`, `LetGet` /
17//! `LetSet`, `LoadField` / `StoreField` (scalar slots: I32 / I64 /
18//! F64 / Bool / Unit), `Add` / `Sub` / `Mul` / `Div` / `Mod` /
19//! `BitAnd` (`I32` and `I64`), comparison ops (`Eq` / `Ne` /
20//! `Lt` / `Le` / `Gt` / `Ge` — `I32` / `I64` / `Bool` for `Eq`/`Ne`),
21//! structured control flow (`Block` / `Loop` / `Br` / `BrIf` /
22//! `If`), and `Return`.
23//!
24//! Ops outside the Phase B envelope (stdlib `Call`, pointer-indirect
25//! `StoreField`, `MakeClosure`, sandbox-trap helpers, schema-method
26//! dispatch, …) surface as [`crate::LlvmError::Codegen`]. They are
27//! tracked for Phase C.
28//!
29//! ## Control-flow lowering vs cranelift
30//!
31//! Cranelift's `block-with-params` keeps phi nodes implicit (every
32//! branch passes the carried values as block arguments). LLVM IR
33//! requires explicit `phi` nodes per joining basic block. We avoid
34//! both by spilling the IR stack through `alloca` slots whenever
35//! control flow joins, and reading them back on the consumer side.
36//! That mirrors how a naive byte-code-to-LLVM emitter behaves and
37//! relies on LLVM's `mem2reg` pass at -O2/-O3 to turn the alloca
38//! reads back into SSA values + phis. For the W1 / W2 hot loops
39//! `mem2reg` collapses the alloca traffic into a single
40//! loop-carried IR value (verified via `emit_ir_dump`'s output at
41//! `-O2`).
42//!
43//! ## Stack discipline
44//!
45//! The IR's stack machine carries one value per push. We track the
46//! per-op operand stack as `Vec<IntValue>` (every IR value the W1/W2
47//! envelope produces fits in an integer type — I32 for Bool / I32-
48//! tagged values, I64 for I64-tagged values). The wasm-style "every
49//! value above the operand stack is unreachable after `br`" rule
50//! lets us drop unconsumed stack slots silently — LLVM's verifier
51//! catches missing terminators if we forget to seal a block.
52
53use std::collections::HashMap;
54
55use inkwell::builder::Builder;
56use inkwell::context::Context;
57use inkwell::module::{Linkage, Module as LlvmModule};
58use inkwell::types::{BasicMetadataTypeEnum, BasicTypeEnum};
59use inkwell::values::{BasicValue, BasicValueEnum, FunctionValue, IntValue, PointerValue};
60use inkwell::{AddressSpace, IntPredicate};
61
62use relon_ir::ir::{Func, IrType, Module as IrModule, Op, TaggedOp};
63
64use crate::error::LlvmError;
65use crate::state::{ARENA_STATE_OFFSET_BASE, ARENA_STATE_OFFSET_TAIL_CURSOR};
66
67// Per-`Op`-family lowering modules. Each holds an
68// `impl<'ctx, 'b, 'cp> Emit<'ctx, 'b, 'cp>` block with the `emit_*`
69// methods for that family; the exhaustive `lower_op` dispatch below
70// delegates to them. Mirrors the cranelift backend's `codegen/*`
71// split so Phase 0b can fill unimplemented families in place without
72// colliding. (Behavior-preserving reorg — Phase 0a.)
73mod arith;
74mod call;
75mod closure;
76mod collections;
77mod control;
78mod mem;
79mod schema;
80mod string;
81mod unicode;
82
83// Family-local enums consumed by the central `lower_op` dispatch.
84use arith::BinOp;
85use mem::{AbsLoad, AbsStore};
86
/// Canonical export name the entry function uses in the emitted LLVM
/// module.
///
/// The evaluator side `dlsym`s / `get_function`s against this symbol
/// after JIT finalize, so renaming it requires touching both crates
/// simultaneously.
pub(crate) const ENTRY_SYMBOL: &str = "relon_llvm_entry";
92
93/// Tag a `load` instruction with `!invariant.load !{}` so LLVM treats
94/// every load from the address as returning the same value for the
95/// instruction's lifetime — letting GVN/LICM hoist it out of loops and
96/// collapse redundant reloads.
97///
98/// SOUND ONLY for genuinely call-invariant memory. The single caller is
99/// the per-entry / per-lambda `state.arena_base` word load
100/// (`ARENA_STATE_OFFSET_BASE`): the host fills the base pointer into the
101/// `ArenaState` struct *before* the entry runs and never mutates it for
102/// the call's duration (only the scratch / tail cursors at later offsets
103/// are written — see `state.rs`; no `build_store` ever targets offset 0).
104/// Without this tag LLVM reloads the base from the opaque state pointer on
105/// every arena access inside a loop (the W20 n-body inner loop showed a
106/// `mov (%state), %base` reload per pair access), because it cannot prove
107/// the intervening arena stores don't alias the state struct. The tag is
108/// metadata only — it changes no value, so every backend stays
109/// bit-identical.
110fn mark_invariant_load(ctx: &Context, loaded: BasicValueEnum<'_>) {
111 if let Some(inst) = loaded.as_instruction_value() {
112 let kind_id = ctx.get_kind_id("invariant.load");
113 let empty = ctx.metadata_node(&[]);
114 let _ = inst.set_metadata(empty, kind_id);
115 }
116}
117
/// Export name of the Phase D.1 dispatch-boundary fast-path entry.
///
/// The fast path is a second exported entry emitted alongside the
/// buffer-protocol entry whenever the source's `#main(Int...) -> Int`
/// shape qualifies. It skips the HashMap pack + arena round-trip the
/// buffer envelope incurs, dropping the per-call boundary cost from the
/// ~650 ns band into the rust-native ballpark.
///
/// Only resolved when the evaluator's [`FastPathProfile`] is `Some`;
/// the symbol is absent from the JIT module otherwise.
pub(crate) const ENTRY_SYMBOL_FAST: &str = "relon_llvm_entry_fast";
127
/// Which signature the LLVM emitter should generate for the entry
/// function.
///
/// Mirrors the cranelift crate's `EntryShape` enum so a side-by-side
/// comparison of the two backends shares the same vocabulary.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum EntryShape {
    /// `(I64...) -> I64`. The Phase A bootstrap envelope — used by
    /// `from_ir_direct` callers (tests, helloworld_arith fixtures).
    LegacyI64,
    /// `(*state, i32 in_ptr, i32 in_len, i32 out_ptr, i32 out_cap,
    /// i64 caps) -> i32`. The shape `lower_workspace_single`
    /// synthesises for every user `#main` source. State is the
    /// first parameter to match the cranelift backend's
    /// `BufferEntryFn` layout.
    Buffer,
}
143
/// Stage 1.B: how `Op::CallNative` is lowered.
///
/// - **Open-world**: dynamic dispatch through the
///   `relon_llvm_call_native` helper, resolved at runtime via
///   `add_global_mapping`.
/// - **Closed-world**: static dispatch — a direct `call @<host_symbol>`
///   to an `extern` declaration the LTO co-compile step later links +
///   inlines.
///
/// `OpenWorld` is the default and the only path MCJIT / `from_source`
/// ever uses — it must stay reachable verbatim. `ClosedWorld` is only
/// selected by the co-compile orchestration (`crate::cocompile`) when
/// the full host-fn set is known at emit time (the build.rs /
/// `emit_object` path), mirroring cranelift's *static* `cap_lookup ->
/// fn_ptr` arm rather than its `_dynamic` helper arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum WorldMode {
    /// Dynamic dispatch through `relon_llvm_call_native`. Default so
    /// the existing MCJIT / `from_source` path is untouched.
    #[default]
    OpenWorld,
    /// Static `call @<host_symbol>` to an external declaration. The
    /// host bitcode is linked in + inlined by the LTO co-compile pass.
    ClosedWorld,
}
166
/// Phase D.1 fast-path profile: describes a `#main(Int...) -> Int`
/// source shape eligible for the typed legacy-i64 dispatch fast path.
///
/// The profile maps each declared `#main` Int parameter's buffer
/// offset to the LLVM fast entry's i64 positional slot, and records
/// the offset of the single Int return slot so the trailing
/// `StoreField` can be rewritten into a `ret`. Used exclusively by
/// [`emit_fast_entry`].
#[derive(Debug, Clone)]
pub(crate) struct FastPathProfile {
    /// One entry per declared `#main` arg: the field's byte offset in
    /// the input buffer (matches what `LoadField { offset }` carries
    /// in the IR body). Vector order parallels schema declaration
    /// order.
    ///
    /// NOTE(review): only the offset is stored; the i64 slot index in
    /// the fast entry signature is presumably the element's position in
    /// this vector — confirm against `emit_fast_entry`.
    pub(crate) arg_offsets: Vec<u32>,
    /// Byte offset of the single `value` field in the return buffer.
    /// The trailing `StoreField { offset, ty: I64 }` whose offset
    /// matches this value gets rewritten into a `ret` on the value
    /// (after popping the IR stack normally). Any other `StoreField`
    /// surfaces as an emitter error — the fast path only handles
    /// single-value-wrapper returns.
    pub(crate) ret_offset: u32,
}
190
/// Phase E.1: per-module const-pool blob laid out at compile time and
/// copied into the arena prefix on every dispatch. Mirrors
/// `relon_codegen_cranelift::codegen::ConstPool` (shape only — the LLVM
/// side keeps it scoped to this crate so the dep direction stays
/// one-way).
///
/// Records are appended to [`Self::bytes`] in IR-walk order. String
/// records use the layout `[len: u32 LE][utf8 bytes]`, aligned to 4;
/// every other record kind documents its own layout and alignment on
/// the matching offset map below. Each `Op::ConstString { idx }`
/// resolves to `string_offsets[idx]` — the byte offset of its record
/// inside [`Self::bytes`] (= the arena-relative offset once the host has
/// copied the blob to the arena prefix).
#[derive(Debug, Default, Clone)]
pub struct ConstPool {
    /// `idx -> byte offset` within `bytes`. The emitter materialises
    /// `Op::ConstString { idx }` as `iconst(I32, string_offsets[idx])`.
    pub string_offsets: std::collections::HashMap<u32, u32>,
    /// `List<Int>` pool: `idx -> byte offset`. Mirrors cranelift's
    /// `ConstPool::list_int_offsets`; record layout is
    /// `[len: u32 LE][pad: u32][i64 elements LE]`, aligned to 8.
    pub list_int_offsets: std::collections::HashMap<u32, u32>,
    /// `List<Float>` pool: `idx -> byte offset`. Same layout as
    /// `list_int_offsets` (f64 elements stored as their u64 LE
    /// bit-pattern), aligned to 8.
    pub list_float_offsets: std::collections::HashMap<u32, u32>,
    /// `List<Bool>` pool: `idx -> byte offset`. Record layout is
    /// `[len: u32 LE][u8 booleans]` (tightly packed), aligned to 4.
    pub list_bool_offsets: std::collections::HashMap<u32, u32>,
    /// W5-P2: `List<String>` pointer-array pool: `idx -> header byte
    /// offset`. Record layout (byte-identical to cranelift's
    /// `visit_const_list_string`): each element's `[slen: u32 LE][utf8]`
    /// String record is emitted first (4-aligned), then the header
    /// `[len: u32 LE][off_0: u32 LE]...[off_{N-1}: u32 LE]` whose
    /// `off_i` is the arena-relative offset of String record `i`.
    pub list_string_offsets: std::collections::HashMap<u32, u32>,
    /// W5-P1/P3: `{String -> Int}` dict pool: `idx -> record byte
    /// offset`. Record layout (byte-identical to cranelift's
    /// `visit_const_dict`): `[entry_count: u32 LE][pad: u32][shape_hash:
    /// u64 LE]` header, a `[key_off: u32][key_len: u32][value: i64]`
    /// entry table sorted by key bytes, then the concatenated UTF-8 key
    /// payload (`key_off` record-relative). The W5-P3 dict-get probe
    /// binary-/linear-searches this table at runtime.
    pub dict_offsets: std::collections::HashMap<u32, u32>,
    /// Wave R14: Unicode `*TableAddr` pool. Each distinct
    /// [`unicode::UnicodeTable`] referenced anywhere in the module (incl.
    /// inlined bundled-stdlib helper bodies) is encoded once via the
    /// shared `relon_ir` encoders and laid into [`Self::bytes`]; the
    /// `*TableAddr` op resolves to the recorded arena-relative offset.
    /// Byte-identical to cranelift's per-table `ConstPool` slots.
    pub(crate) unicode_table_offsets: std::collections::HashMap<unicode::UnicodeTable, u32>,
    /// Materialised bytes in record order. The host trampoline copies
    /// these verbatim to `arena[..bytes.len()]` before every dispatch.
    pub bytes: Vec<u8>,
}
244
245impl ConstPool {
246 /// Build the pool by walking every function body in `module` and
247 /// collecting each unique `Op::ConstString { idx, value }`. Records
248 /// are laid out in walk-order with 4-byte alignment.
249 pub fn from_module(module: &IrModule) -> Result<Self, LlvmError> {
250 let mut pool = ConstPool::default();
251 for func in &module.funcs {
252 pool.collect_body(&func.body)?;
253 }
254 Ok(pool)
255 }
256
257 fn collect_body(&mut self, body: &[TaggedOp]) -> Result<(), LlvmError> {
258 for tagged in body {
259 self.collect_op(&tagged.op)?;
260 }
261 Ok(())
262 }
263
264 fn collect_op(&mut self, op: &Op) -> Result<(), LlvmError> {
265 match op {
266 Op::ConstString { idx, value } => self.add_string(*idx, value),
267 Op::ConstListInt { idx, elements } => self.add_list_int(*idx, elements),
268 Op::ConstListFloat { idx, elements } => self.add_list_float(*idx, elements),
269 Op::ConstListBool { idx, elements } => self.add_list_bool(*idx, elements),
270 Op::ConstListString { idx, elements } => self.add_list_string(*idx, elements),
271 Op::ConstDict { idx, entries } => self.add_dict(*idx, entries),
272 Op::Block { body, .. } | Op::Loop { body, .. } => self.collect_body(body),
273 Op::If {
274 then_body,
275 else_body,
276 ..
277 } => {
278 self.collect_body(then_body)?;
279 self.collect_body(else_body)
280 }
281 // Op::Call inlines a bundled-stdlib body whose own
282 // `Op::ConstString` literals must also land in the pool —
283 // mirror cranelift's recursion through `builtin_stdlib`.
284 Op::Call { fn_index, .. } => {
285 let stdlib = relon_ir::stdlib::builtin_stdlib();
286 if let Some(callee) = stdlib.get(*fn_index as usize) {
287 let body = callee.body_owned();
288 self.collect_body(&body)?;
289 }
290 Ok(())
291 }
292 // Wave R14: Unicode `*TableAddr` ops. Lay each referenced
293 // table into the const prefix once (deduped by table identity)
294 // so the lowering resolves to a fixed offset instead of
295 // copying the table into scratch per op-execution.
296 other => {
297 if let Some(table) = unicode::UnicodeTable::from_op(other) {
298 self.add_unicode_table(table)?;
299 }
300 Ok(())
301 }
302 }
303 }
304
305 /// Lay `table`'s encoded bytes into the pool on first reference and
306 /// record the arena-relative offset. The byte encoder is the exact
307 /// shared `relon_ir` function cranelift's `ConstPool` calls, so the
308 /// data a lookup helper reads is byte-identical across backends.
309 /// Aligned to 4 to match every `*TableAddr` slot on the cranelift
310 /// side (the table headers are read with 4-byte-aligned i32 loads).
311 fn add_unicode_table(&mut self, table: unicode::UnicodeTable) -> Result<(), LlvmError> {
312 if self.unicode_table_offsets.contains_key(&table) {
313 return Ok(());
314 }
315 self.align_to(4);
316 let off = u32::try_from(self.bytes.len())
317 .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
318 let bytes = table.encode_bytes();
319 self.bytes.extend_from_slice(&bytes);
320 self.unicode_table_offsets.insert(table, off);
321 Ok(())
322 }
323
324 fn add_string(&mut self, idx: u32, value: &str) -> Result<(), LlvmError> {
325 if self.string_offsets.contains_key(&idx) {
326 return Ok(());
327 }
328 // Align to 4 so the `[len: u32]` header lands on a 4-byte
329 // boundary — i32 loads through the JIT use `align=4` and we
330 // don't want an unaligned trap on hosts where it matters.
331 self.align_to(4);
332 let off = u32::try_from(self.bytes.len())
333 .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
334 let len = u32::try_from(value.len())
335 .map_err(|_| LlvmError::Codegen("ConstString length exceeds u32 range".into()))?;
336 self.bytes.extend_from_slice(&len.to_le_bytes());
337 self.bytes.extend_from_slice(value.as_bytes());
338 self.string_offsets.insert(idx, off);
339 Ok(())
340 }
341
342 /// Pad `bytes` up to the next `align` boundary with zero fill.
343 /// Mirrors cranelift's `ConstPool::align_to`.
344 fn align_to(&mut self, align: usize) {
345 let rem = self.bytes.len() % align;
346 if rem != 0 {
347 self.bytes.resize(self.bytes.len() + (align - rem), 0);
348 }
349 }
350
351 /// Lay out a `List<Int>` record. Byte layout
352 /// `[len: u32 LE][pad: u32 zero][i64 elements LE]`, aligned to 8 —
353 /// byte-identical to cranelift's `visit_const_list_int` (cross-
354 /// backend arena data contract).
355 fn add_list_int(&mut self, idx: u32, elements: &[i64]) -> Result<(), LlvmError> {
356 if self.list_int_offsets.contains_key(&idx) {
357 return Ok(());
358 }
359 self.align_to(8);
360 let off = u32::try_from(self.bytes.len())
361 .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
362 let len = u32::try_from(elements.len())
363 .map_err(|_| LlvmError::Codegen("ConstListInt length exceeds u32 range".into()))?;
364 self.bytes.extend_from_slice(&len.to_le_bytes());
365 self.bytes.extend_from_slice(&[0u8; 4]); // pad to 8
366 for e in elements {
367 self.bytes.extend_from_slice(&e.to_le_bytes());
368 }
369 self.list_int_offsets.insert(idx, off);
370 Ok(())
371 }
372
373 /// Lay out a `List<Float>` record. Same layout as `add_list_int`
374 /// (f64 elements stored as their u64 LE bit-pattern), aligned to 8 —
375 /// byte-identical to cranelift's `visit_const_list_float`.
376 fn add_list_float(&mut self, idx: u32, elements: &[u64]) -> Result<(), LlvmError> {
377 if self.list_float_offsets.contains_key(&idx) {
378 return Ok(());
379 }
380 self.align_to(8);
381 let off = u32::try_from(self.bytes.len())
382 .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
383 let len = u32::try_from(elements.len())
384 .map_err(|_| LlvmError::Codegen("ConstListFloat length exceeds u32 range".into()))?;
385 self.bytes.extend_from_slice(&len.to_le_bytes());
386 self.bytes.extend_from_slice(&[0u8; 4]); // pad to 8
387 for e in elements {
388 self.bytes.extend_from_slice(&e.to_le_bytes());
389 }
390 self.list_float_offsets.insert(idx, off);
391 Ok(())
392 }
393
394 /// Lay out a `List<Bool>` record. Byte layout
395 /// `[len: u32 LE][u8 booleans]` (tightly packed), aligned to 4 —
396 /// byte-identical to cranelift's `visit_const_list_bool`.
397 fn add_list_bool(&mut self, idx: u32, elements: &[bool]) -> Result<(), LlvmError> {
398 if self.list_bool_offsets.contains_key(&idx) {
399 return Ok(());
400 }
401 self.align_to(4);
402 let off = u32::try_from(self.bytes.len())
403 .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
404 let len = u32::try_from(elements.len())
405 .map_err(|_| LlvmError::Codegen("ConstListBool length exceeds u32 range".into()))?;
406 self.bytes.extend_from_slice(&len.to_le_bytes());
407 for e in elements {
408 self.bytes.push(if *e { 1 } else { 0 });
409 }
410 self.list_bool_offsets.insert(idx, off);
411 Ok(())
412 }
413
414 /// W5-P2: lay out a `List<String>` pointer-array record. Each
415 /// element's `[slen: u32 LE][utf8]` String record is emitted first
416 /// (4-aligned), then the header `[len: u32 LE][off_0: u32 LE]...`
417 /// whose `off_i` is the arena-relative offset of String record `i`.
418 /// Byte-identical to cranelift's `visit_const_list_string` (cross-
419 /// backend arena data contract); the `idx -> header offset` map is
420 /// what `Op::ConstListString` resolves to.
421 fn add_list_string(&mut self, idx: u32, elements: &[String]) -> Result<(), LlvmError> {
422 if self.list_string_offsets.contains_key(&idx) {
423 return Ok(());
424 }
425 self.align_to(4);
426 let mut str_offsets: Vec<u32> = Vec::with_capacity(elements.len());
427 for s in elements {
428 self.align_to(4);
429 let s_off = u32::try_from(self.bytes.len()).map_err(|_| {
430 LlvmError::Codegen("ConstListString string offset exceeds u32".into())
431 })?;
432 let slen = u32::try_from(s.len()).map_err(|_| {
433 LlvmError::Codegen("ConstListString element length exceeds u32".into())
434 })?;
435 self.bytes.extend_from_slice(&slen.to_le_bytes());
436 self.bytes.extend_from_slice(s.as_bytes());
437 str_offsets.push(s_off);
438 }
439 self.align_to(4);
440 let header_off = u32::try_from(self.bytes.len())
441 .map_err(|_| LlvmError::Codegen("ConstListString header offset exceeds u32".into()))?;
442 let len = u32::try_from(elements.len())
443 .map_err(|_| LlvmError::Codegen("ConstListString length exceeds u32".into()))?;
444 self.bytes.extend_from_slice(&len.to_le_bytes());
445 for off in &str_offsets {
446 self.bytes.extend_from_slice(&off.to_le_bytes());
447 }
448 self.list_string_offsets.insert(idx, header_off);
449 Ok(())
450 }
451
452 /// W5-P1/P3: lay out a `{String -> Int}` dict record. Byte-identical
453 /// to cranelift's `const_pool::visit_const_dict` (cross-backend
454 /// arena data contract) so the W5-P3 dict-get probe reads the same
455 /// bytes on either backend:
456 ///
457 /// ```text
458 /// [entry_count: u32 LE][pad: u32][shape_hash: u64 LE] ; 16-byte header
459 /// entry_count × [key_off: u32 LE][key_len: u32 LE][value: i64 LE]
460 /// concatenated UTF-8 key bytes ; key_off record-rel
461 /// ```
462 ///
463 /// The entry table is sorted by key bytes (deterministic + probe-
464 /// friendly); the record start is 8-aligned so the i64 values + the
465 /// u64 shape_hash land on natural boundaries.
466 fn add_dict(&mut self, idx: u32, entries: &[(String, i64)]) -> Result<(), LlvmError> {
467 if self.dict_offsets.contains_key(&idx) {
468 return Ok(());
469 }
470 self.align_to(8);
471 let off = u32::try_from(self.bytes.len())
472 .map_err(|_| LlvmError::Codegen("llvm const pool exceeds u32 range".into()))?;
473
474 let mut sorted: Vec<&(String, i64)> = entries.iter().collect();
475 sorted.sort_by(|a, b| a.0.as_bytes().cmp(b.0.as_bytes()));
476
477 let entry_count = u32::try_from(sorted.len())
478 .map_err(|_| LlvmError::Codegen("ConstDict entry count exceeds u32".into()))?;
479 let shape_hash =
480 relon_ir::shape_hash::shape_hash_for_keys(sorted.iter().map(|(k, _)| k.as_str()));
481
482 // Header.
483 self.bytes.extend_from_slice(&entry_count.to_le_bytes());
484 self.bytes.extend_from_slice(&[0u8; 4]); // pad: keep shape_hash 8-aligned
485 self.bytes.extend_from_slice(&shape_hash.to_le_bytes());
486
487 const HEADER_BYTES: u32 = 16;
488 const ENTRY_BYTES: u32 = 16;
489 let table_bytes = entry_count
490 .checked_mul(ENTRY_BYTES)
491 .ok_or_else(|| LlvmError::Codegen("ConstDict table size overflow".into()))?;
492 let key_payload_base = HEADER_BYTES
493 .checked_add(table_bytes)
494 .ok_or_else(|| LlvmError::Codegen("ConstDict key base overflow".into()))?;
495
496 // Entry table. key_off is record-relative; accumulate as we go.
497 let mut running_key_off = key_payload_base;
498 for (key, value) in &sorted {
499 let key_len = u32::try_from(key.len())
500 .map_err(|_| LlvmError::Codegen("ConstDict key length exceeds u32".into()))?;
501 self.bytes.extend_from_slice(&running_key_off.to_le_bytes());
502 self.bytes.extend_from_slice(&key_len.to_le_bytes());
503 self.bytes.extend_from_slice(&value.to_le_bytes());
504 running_key_off = running_key_off
505 .checked_add(key_len)
506 .ok_or_else(|| LlvmError::Codegen("ConstDict key offset overflow".into()))?;
507 }
508
509 // Key payload.
510 for (key, _) in &sorted {
511 self.bytes.extend_from_slice(key.as_bytes());
512 }
513
514 self.dict_offsets.insert(idx, off);
515 Ok(())
516 }
517}
518
519/// IR param signature that triggers [`EntryShape::Buffer`]. Mirrors
520/// `is_buffer_protocol_signature` on the cranelift side.
521pub(crate) fn is_buffer_protocol_signature(params: &[IrType], ret: IrType) -> bool {
522 matches!(
523 params,
524 [
525 IrType::I32,
526 IrType::I32,
527 IrType::I32,
528 IrType::I32,
529 IrType::I64
530 ]
531 ) && matches!(ret, IrType::I32)
532}
533
/// Open-world, native-target entry point (the only one MCJIT /
/// `from_source` use). `Op::CallNative` lowers to the dynamic
/// `relon_llvm_call_native` helper. Signature kept stable so the
/// `evaluator.rs` call sites are untouched.
///
/// Forwards every argument to `emit_module_funcs_impl` with
/// `WorldMode::OpenWorld`, `CodegenTarget::Native`, and an empty
/// effectful-imports slice.
///
/// ## Phase E.2 multi-function emit
///
/// Lowers every reachable IR function into LLVM. The entry function
/// `entry` is emitted under either the legacy-i64 or buffer-protocol
/// shape; each entry in `helpers` is emitted as a sibling helper
/// function with a plain typed `(params...) -> ret` signature so the
/// entry's `Op::Call` lowering can route to it through a direct LLVM
/// `call` instruction.
///
/// `helper_ir_indices` parallels `helpers`: entry `i` carries the
/// IR-side `funcs` index for the matching helper. Used by the
/// `Op::Call` lowering to resolve `fn_index - stdlib_count` back to the
/// matching `FunctionValue`.
///
/// ## Phase F.W7: closures-as-values
///
/// - `lambdas` carries the IR funcs the lowering pass appended to the
///   module's closure table (`#main`-side `fib: (k) => ...` lifts to a
///   lambda Func). Each lambda is declared / emitted with the
///   signature `(state, captures_ptr, ...lambda.params[1..]) -> ret`
///   so the body's `LocalGet(0)` reads the captures_ptr arg, and so
///   `Op::AllocScratch` / `*AtAbsolute` ops inside the body can reach
///   the per-call arena state.
/// - `closure_table` mirrors the IR's `Module::closure_table` so the
///   emitter knows which `fn_table_idx` resolves to which lambda
///   `FunctionValue`. Returned alongside `helper_table` so the
///   `Op::MakeClosure` / `Op::CallClosure` lowering can refer to it.
///
/// ## Const pool
///
/// `const_pool` ships the per-module ConstString blob the entry +
/// helper bodies index into via `Op::ConstString { idx }`. The host
/// copies `const_pool.bytes` to the arena prefix before every
/// dispatch so the materialised `iconst(I32, offset)` resolves to a
/// stable address.
///
/// ## Returns
///
/// The entry `FunctionValue`, the detected entry shape, the helper
/// lookup table the `Emit` driver hands off to the per-function
/// lowering so sibling calls can find their callee, and the closure
/// table (one entry per `fn_table_idx`, in source order).
#[allow(clippy::too_many_arguments, clippy::type_complexity)]
pub(crate) fn emit_module_funcs<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    entry: &Func,
    buffer_return_size: u32,
    const_pool: &ConstPool,
    helpers: &[&Func],
    helper_ir_indices: Option<&[u32]>,
    lambdas: &[&Func],
    closure_table: &[u32],
    imports: &[relon_ir::ir::NativeImport],
) -> Result<
    (
        FunctionValue<'ctx>,
        EntryShape,
        HashMap<u32, FunctionValue<'ctx>>,
        Vec<FunctionValue<'ctx>>,
    ),
    LlvmError,
> {
    emit_module_funcs_impl(
        ctx,
        module,
        entry,
        buffer_return_size,
        const_pool,
        helpers,
        helper_ir_indices,
        lambdas,
        closure_table,
        imports,
        WorldMode::OpenWorld,
        crate::CodegenTarget::Native,
        // No per-import effectful flags are supplied on this path.
        &[],
    )
}
611
/// P3 §2.2 wasm32 entry point. Same open-world dispatch as
/// [`emit_module_funcs`] but targets wasm32 so an `Op::CallNative`
/// lowers to a **wasm import** call ([`crate::wasi_host`]) instead of the
/// native `relon_llvm_call_native` MCJIT helper. Used only by the
/// `emit_object_for_target(.., CodegenTarget::Wasm32)` object-emit path.
///
/// Forwards every argument to `emit_module_funcs_impl` with
/// `WorldMode::OpenWorld`, `CodegenTarget::Wasm32`, and an empty
/// effectful-imports slice. The parameters and the returned tuple are
/// passed through unchanged.
#[allow(clippy::too_many_arguments, clippy::type_complexity)]
pub(crate) fn emit_module_funcs_wasm<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    entry: &Func,
    buffer_return_size: u32,
    const_pool: &ConstPool,
    helpers: &[&Func],
    helper_ir_indices: Option<&[u32]>,
    lambdas: &[&Func],
    closure_table: &[u32],
    imports: &[relon_ir::ir::NativeImport],
) -> Result<
    (
        FunctionValue<'ctx>,
        EntryShape,
        HashMap<u32, FunctionValue<'ctx>>,
        Vec<FunctionValue<'ctx>>,
    ),
    LlvmError,
> {
    emit_module_funcs_impl(
        ctx,
        module,
        entry,
        buffer_return_size,
        const_pool,
        helpers,
        helper_ir_indices,
        lambdas,
        closure_table,
        imports,
        WorldMode::OpenWorld,
        crate::CodegenTarget::Wasm32,
        // No per-import effectful flags are supplied on this path.
        &[],
    )
}
654
/// Stage 1.B closed-world entry point. `Op::CallNative` lowers to a
/// static `call @<host_symbol>` against an `extern` declaration; the
/// host bitcode is linked in + inlined by [`crate::cocompile`]. Used
/// only by the co-compile orchestration — never by MCJIT / `from_source`.
///
/// Forwards every argument to `emit_module_funcs_impl` with
/// `WorldMode::ClosedWorld`, `CodegenTarget::Native`, and an empty
/// effectful-imports slice. The parameters and the returned tuple are
/// passed through unchanged.
#[allow(clippy::too_many_arguments, clippy::type_complexity)]
pub(crate) fn emit_module_funcs_closed_world<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    entry: &Func,
    buffer_return_size: u32,
    const_pool: &ConstPool,
    helpers: &[&Func],
    helper_ir_indices: Option<&[u32]>,
    lambdas: &[&Func],
    closure_table: &[u32],
    imports: &[relon_ir::ir::NativeImport],
) -> Result<
    (
        FunctionValue<'ctx>,
        EntryShape,
        HashMap<u32, FunctionValue<'ctx>>,
        Vec<FunctionValue<'ctx>>,
    ),
    LlvmError,
> {
    emit_module_funcs_impl(
        ctx,
        module,
        entry,
        buffer_return_size,
        const_pool,
        helpers,
        helper_ir_indices,
        lambdas,
        closure_table,
        imports,
        WorldMode::ClosedWorld,
        crate::CodegenTarget::Native,
        // No per-import effectful flags are supplied on this path.
        &[],
    )
}
696
/// P3 §2.2 wasm closed-world co-compile entry point. Like
/// [`emit_module_funcs_closed_world`] but targets **wasm32**: a
/// pure-compute `Op::CallNative` (an import whose `effectful_imports`
/// flag is `false`) lowers to a direct `call @<host_symbol>` that the
/// wasm host-shim co-compile ([`crate::cocompile::link_and_inline_host_shim_wasm`])
/// links + inlines into the wasm unit, mirroring the native closed-world
/// inline. An **effectful** import (flag `true` — capability-gated by a
/// preceding `Op::CheckCap`) instead routes to a **wasm import** call
/// ([`crate::wasi_host`]) so the side effect crosses the sandbox boundary
/// back out to the trusted host (ADR §2.2: pure inline, effectful → WASI).
///
/// `effectful_imports[i]` is the per-`import_idx` effectful flag; the
/// caller (`emit_object_for_target`) derives it from the IR's
/// CheckCap → CallNative shape.
///
/// Forwards every argument to `emit_module_funcs_impl` with
/// `WorldMode::ClosedWorld` and `CodegenTarget::Wasm32`; this is the
/// only wrapper that passes a caller-supplied `effectful_imports`
/// slice rather than an empty one.
#[allow(clippy::too_many_arguments, clippy::type_complexity)]
pub(crate) fn emit_module_funcs_closed_world_wasm<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    entry: &Func,
    buffer_return_size: u32,
    const_pool: &ConstPool,
    helpers: &[&Func],
    helper_ir_indices: Option<&[u32]>,
    lambdas: &[&Func],
    closure_table: &[u32],
    imports: &[relon_ir::ir::NativeImport],
    effectful_imports: &[bool],
) -> Result<
    (
        FunctionValue<'ctx>,
        EntryShape,
        HashMap<u32, FunctionValue<'ctx>>,
        Vec<FunctionValue<'ctx>>,
    ),
    LlvmError,
> {
    emit_module_funcs_impl(
        ctx,
        module,
        entry,
        buffer_return_size,
        const_pool,
        helpers,
        helper_ir_indices,
        lambdas,
        closure_table,
        imports,
        WorldMode::ClosedWorld,
        crate::CodegenTarget::Wasm32,
        effectful_imports,
    )
}
749
750#[allow(clippy::too_many_arguments, clippy::type_complexity)]
751fn emit_module_funcs_impl<'ctx>(
752 ctx: &'ctx Context,
753 module: &LlvmModule<'ctx>,
754 entry: &Func,
755 buffer_return_size: u32,
756 const_pool: &ConstPool,
757 helpers: &[&Func],
758 helper_ir_indices: Option<&[u32]>,
759 lambdas: &[&Func],
760 closure_table: &[u32],
761 imports: &[relon_ir::ir::NativeImport],
762 world_mode: WorldMode,
763 target: crate::CodegenTarget,
764 effectful_imports: &[bool],
765) -> Result<
766 (
767 FunctionValue<'ctx>,
768 EntryShape,
769 HashMap<u32, FunctionValue<'ctx>>,
770 Vec<FunctionValue<'ctx>>,
771 ),
772 LlvmError,
773> {
774 // Step 0: declare module-level intrinsics. `llvm.trap` is shared
775 // by every Div / Mod sandbox guard so a single declaration covers
776 // every per-op guard across every emitted function.
777 declare_llvm_trap(ctx, module);
778
779 // Step 1: declare every helper up-front so the entry / sibling
780 // bodies can resolve forward references (mutual recursion, the
781 // `fib(n - 1) + fib(n - 2)` self-call). LLVM is happy to issue
782 // `call @foo` against a declared-only function; the body is
783 // attached on the second pass.
784 let mut helper_table: HashMap<u32, FunctionValue<'ctx>> = HashMap::new();
785 if let Some(ir_indices) = helper_ir_indices {
786 if ir_indices.len() != helpers.len() {
787 return Err(LlvmError::Codegen(format!(
788 "emit_module_funcs: helpers.len()={} but helper_ir_indices.len()={}",
789 helpers.len(),
790 ir_indices.len()
791 )));
792 }
793 }
794 for (i, helper) in helpers.iter().enumerate() {
795 let fv = declare_helper_function(ctx, module, helper, i)?;
796 let ir_idx = helper_ir_indices.map(|v| v[i]).unwrap_or(i as u32);
797 helper_table.insert(ir_idx, fv);
798 }
799
800 // Phase F.W7: declare every lambda function up-front. Lambdas use
801 // a widened signature `(state, ...lambda.params) -> ret` — the
802 // first IR param (already `IrType::I32`, the captures_ptr the IR
803 // lowering pass prepended in `lower_closure_as_value`) becomes
804 // LLVM param 1 (just past the implicit `*state`). Subsequent
805 // user params shift to LLVM param indices 2.. so the body's
806 // `LocalGet(idx)` resolves to LLVM param `idx + 1`
807 // (`param_base = 1`).
808 let mut closure_fn_table: Vec<FunctionValue<'ctx>> = Vec::with_capacity(closure_table.len());
809 if lambdas.len() != closure_table.len() {
810 return Err(LlvmError::Codegen(format!(
811 "emit_module_funcs: lambdas.len()={} but closure_table.len()={}",
812 lambdas.len(),
813 closure_table.len()
814 )));
815 }
816 for (slot, lambda) in lambdas.iter().enumerate() {
817 let fv = declare_lambda_function(ctx, module, lambda, slot)?;
818 closure_fn_table.push(fv);
819 }
820
821 // Step 2: emit the entry function body.
822 let (entry_fn, shape) = if is_buffer_protocol_signature(&entry.params, entry.ret) {
823 let fv = emit_buffer_entry_with_helpers_and_closures(
824 ctx,
825 module,
826 entry,
827 buffer_return_size,
828 const_pool,
829 &helper_table,
830 &closure_fn_table,
831 imports,
832 world_mode,
833 target,
834 effectful_imports,
835 )?;
836 (fv, EntryShape::Buffer)
837 } else {
838 // The legacy-i64 entry shape covers hand-built fixtures only; it
839 // never references ConstString and supplies its own empty pool
840 // inside `emit_legacy_entry_impl`.
841 let fv =
842 emit_legacy_entry_with_helpers(ctx, module, entry, &helper_table, imports, world_mode)?;
843 (fv, EntryShape::LegacyI64)
844 };
845
846 // Step 3: emit each helper body now that every callee is declared.
847 for helper in helpers.iter() {
848 let helper_fn = helper_table
849 .values()
850 .find(|fv| {
851 // Locate the FunctionValue by name; cheap enough — the
852 // helper table is tiny and the find runs once per
853 // helper.
854 let expected = format!("relon_helper_{}", helper.name);
855 fv.get_name().to_string_lossy() == expected
856 })
857 .copied()
858 .ok_or_else(|| {
859 LlvmError::Codegen(format!(
860 "emit_module_funcs: helper `{}` declared but FunctionValue missing",
861 helper.name
862 ))
863 })?;
864 emit_helper_body(ctx, module, helper, helper_fn, const_pool, &helper_table)?;
865 }
866
867 // Step 4 (Phase F.W7): emit each lambda body. Lambdas share the
868 // `helper_table` so the body can route an inner `Op::Call` to a
869 // sibling helper (Phase E.2 cross-call). They also share the
870 // `closure_fn_table` so a nested `Op::MakeClosure` resolves the
871 // matching lambda FunctionValue from its `fn_table_idx`.
872 //
873 // Build the module-wide self-capture table once before emitting
874 // lambda bodies. The table maps each lambda's `fn_table_idx` to
875 // the captures-struct offsets that hold self-recursive handles
876 // (i.e. handles whose `captures_ptr` field equals the lambda's
877 // own captures_ptr arg). The lambda-body emit uses this table to
878 // stamp [`Provenance::OwnCaptureHandle`] on the matching capture
879 // loads so the recursive call site can pick the direct-call fast
880 // path. Empty for modules that have no self-recursive closures.
881 let self_capture_table = build_self_capture_table(entry, helpers, lambdas);
882 // Devirtualisation (W18): companion table for captures of known
883 // (non-self) closures — lets the W18 predicate's `is_prime` call
884 // devirtualise inside the predicate lambda body.
885 let known_capture_table = build_known_capture_table(entry, helpers, lambdas);
886 for (slot, lambda) in lambdas.iter().enumerate() {
887 let lambda_fn = closure_fn_table[slot];
888 let slot_u32 = slot as u32;
889 let offsets = self_capture_table
890 .get(&slot_u32)
891 .cloned()
892 .unwrap_or_default();
893 let known_offsets = known_capture_table
894 .get(&slot_u32)
895 .cloned()
896 .unwrap_or_default();
897 emit_lambda_body(
898 ctx,
899 module,
900 lambda,
901 lambda_fn,
902 const_pool,
903 &helper_table,
904 &closure_fn_table,
905 &offsets,
906 &known_offsets,
907 )?;
908 }
909
910 Ok((entry_fn, shape, helper_table, closure_fn_table))
911}
912
913/// Phase F.W7 self-recursion fast path: scan every IR function body
914/// (entry + helpers + lambdas) for the canonical
915/// `Op::MakeClosure { fn_table_idx, captures } ; Op::LetSet { idx, ty:
916/// Closure }` pair and collect the captures whose `let_idx` matches the
917/// `LetSet`'s `idx` — those are the self-recursive captures stamped by
918/// `lower_closure_as_value`'s "let-slot not yet bound" branch.
919///
920/// Returns `fn_table_idx -> [(capture_offset, self_fn_table_idx)]` so
921/// the lambda body emitter can stamp the matching
922/// [`Provenance::OwnCaptureHandle`] on each capture load.
923///
924/// The scan tolerates intervening ops between `MakeClosure` and
925/// `LetSet` (none are emitted today; future lowering passes that
926/// interleave additional setup ops would still be matched). It bails
927/// silently on patterns it can't recognise — the fast path stays
928/// opt-in and the slow-path `emit_call_closure` keeps working
929/// regardless.
930fn build_self_capture_table(
931 entry: &Func,
932 helpers: &[&Func],
933 lambdas: &[&Func],
934) -> HashMap<u32, Vec<(u32, u32)>> {
935 let mut table: HashMap<u32, Vec<(u32, u32)>> = HashMap::new();
936
937 let scan = |func: &Func, table: &mut HashMap<u32, Vec<(u32, u32)>>| {
938 let ops = &func.body;
939 for (i, tagged) in ops.iter().enumerate() {
940 // Find a MakeClosure immediately followed by a matching
941 // `LetSet { ty: Closure }`. The IR lowering pass emits
942 // these adjacently (see `lower_anon_dict_body` /
943 // `lower_closure_as_value`); intervening ops break the
944 // simple match and the slow-path dispatch keeps working.
945 let Op::MakeClosure {
946 fn_table_idx,
947 ref captures,
948 ..
949 } = tagged.op
950 else {
951 continue;
952 };
953 let Some(next) = ops.get(i + 1) else {
954 continue;
955 };
956 let Op::LetSet {
957 idx,
958 ty: relon_ir::ir::IrType::Closure,
959 } = next.op
960 else {
961 continue;
962 };
963 for cap in captures {
964 if cap.let_idx == idx && matches!(cap.ty, relon_ir::ir::IrType::Closure) {
965 table
966 .entry(fn_table_idx)
967 .or_default()
968 .push((cap.offset, fn_table_idx));
969 }
970 }
971 }
972 };
973
974 scan(entry, &mut table);
975 for h in helpers {
976 scan(h, &mut table);
977 }
978 for l in lambdas {
979 scan(l, &mut table);
980 }
981 table
982}
983
984/// Devirtualisation (W18, 2026-05-30): companion to
985/// [`build_self_capture_table`] for *non-self* captures of a closure
986/// whose `fn_table_idx` is a compile-time constant.
987///
988/// Maps each lambda's `fn_table_idx` to the captures-struct offsets that
989/// hold a handle produced by a literal `Op::MakeClosure { K }` (a
990/// *known* closure), together with that `K`. The lambda-body emit uses
991/// this to stamp [`Provenance::KnownClosure`] on the matching capture
992/// load (the prologue `LocalGet(0); LoadI32AtAbsolute { offset };
993/// LetSet { Closure }`), so a `CallClosure` against the capture (e.g.
994/// the W18 predicate's `is_prime(k, 2)` call) emits a direct call
995/// instead of the runtime `switch i32 %cc_fn_idx`.
996///
997/// Soundness: within each function we track, in source order, the
998/// most-recent `MakeClosure { K }; LetSet { idx, Closure }` assignment
999/// per outer let-slot. Any *other* `LetSet { idx, Closure }` clears the
1000/// slot — so a let that is reassigned to a dynamically-chosen closure is
1001/// never recorded as known. A capture is recorded only when its
1002/// `let_idx` resolves to a still-known slot AND the captured `K` differs
1003/// from the capturing lambda `L` (a self-capture, `K == L`, is owned by
1004/// [`build_self_capture_table`], whose `captures_ptr`-reuse fast path is
1005/// strictly better). The lowering pass emits the capturing
1006/// `MakeClosure` only after the captured let is bound and reads the live
1007/// slot, so the tracked `K` is exactly the value the capture holds.
1008fn build_known_capture_table(
1009 entry: &Func,
1010 helpers: &[&Func],
1011 lambdas: &[&Func],
1012) -> HashMap<u32, Vec<(u32, u32)>> {
1013 use relon_ir::ir::IrType as Irt;
1014 let mut table: HashMap<u32, Vec<(u32, u32)>> = HashMap::new();
1015
1016 let scan = |func: &Func, table: &mut HashMap<u32, Vec<(u32, u32)>>| {
1017 let ops = &func.body;
1018 // outer let-slot -> known captured `fn_table_idx`, last-write
1019 // wins; cleared when the slot is reassigned a non-known closure.
1020 let mut known_slots: HashMap<u32, u32> = HashMap::new();
1021 for (i, tagged) in ops.iter().enumerate() {
1022 // Maintain `known_slots` off each `LetSet { idx, Closure }`:
1023 // if the immediately-preceding op is a `MakeClosure { K }`
1024 // (the canonical `MakeClosure; LetSet` binding the lowering
1025 // emits) the slot becomes a *known* closure `K`; any other
1026 // `LetSet { Closure }` stores a value we cannot prove is one
1027 // statically-known closure, so the slot is dropped. Driving
1028 // this off the `LetSet` (rather than the `MakeClosure`)
1029 // avoids the binding `LetSet` clobbering the very entry the
1030 // preceding `MakeClosure` established.
1031 if let Op::LetSet {
1032 idx,
1033 ty: Irt::Closure,
1034 } = tagged.op
1035 {
1036 if let Some(Op::MakeClosure { fn_table_idx, .. }) =
1037 i.checked_sub(1).and_then(|p| ops.get(p)).map(|t| &t.op)
1038 {
1039 known_slots.insert(idx, *fn_table_idx);
1040 } else {
1041 known_slots.remove(&idx);
1042 }
1043 continue;
1044 }
1045 // At a capturing `MakeClosure { L }`, record each capture
1046 // that reads a still-known slot. The capturing closure's own
1047 // handle need NOT be stored to a let — the W18 predicate is
1048 // passed straight into `_list_filter` — because the fact
1049 // recorded here is about lambda `L`'s captures-struct layout
1050 // (offset O holds known closure K), which is fixed by `L`'s
1051 // own `MakeClosure` captures and the known-ness of the
1052 // captured outer let, independent of where `L`'s handle goes.
1053 if let Op::MakeClosure {
1054 fn_table_idx: l_idx,
1055 ref captures,
1056 ..
1057 } = tagged.op
1058 {
1059 for cap in captures {
1060 if !matches!(cap.ty, Irt::Closure) {
1061 continue;
1062 }
1063 if let Some(&k_idx) = known_slots.get(&cap.let_idx) {
1064 // `k_idx == l_idx` is a self-capture — owned by
1065 // `build_self_capture_table`; skip here.
1066 if k_idx != l_idx {
1067 table.entry(l_idx).or_default().push((cap.offset, k_idx));
1068 }
1069 }
1070 }
1071 }
1072 }
1073 };
1074
1075 scan(entry, &mut table);
1076 for h in helpers {
1077 scan(h, &mut table);
1078 }
1079 for l in lambdas {
1080 scan(l, &mut table);
1081 }
1082 table
1083}
1084
1085/// Devirtualisation (W18) correctness helper: collect every let-slot
1086/// index that a body assigns via `Op::LetSet { ty: Closure }`, recursing
1087/// into nested `Op::If` / `Op::Block` / `Op::Loop` bodies. Used by
1088/// `emit_loop` to conservatively invalidate the `KnownClosure` let-slot
1089/// tracker for any closure slot the loop body reassigns, so a
1090/// cross-iteration read cannot devirtualise to a stale target.
1091fn collect_closure_letset_slots(body: &[TaggedOp], out: &mut Vec<u32>) {
1092 for t in body {
1093 match &t.op {
1094 Op::LetSet {
1095 idx,
1096 ty: relon_ir::ir::IrType::Closure,
1097 } => out.push(*idx),
1098 Op::If {
1099 then_body,
1100 else_body,
1101 ..
1102 } => {
1103 collect_closure_letset_slots(then_body, out);
1104 collect_closure_letset_slots(else_body, out);
1105 }
1106 Op::Block { body, .. } | Op::Loop { body, .. } => {
1107 collect_closure_letset_slots(body, out);
1108 }
1109 _ => {}
1110 }
1111 }
1112}
1113
1114/// Declare a sibling helper function's LLVM signature without emitting
1115/// its body. Used to seat every helper into the module so the entry's
1116/// `Op::Call` lowering can resolve forward references (recursion,
1117/// mutual recursion). Sibling helpers use a plain typed
1118/// `(params...) -> ret` shape — no `*state` pointer, no buffer
1119/// protocol; the test harness drives recursive Int-only functions
1120/// directly. When the IR layer grows first-class closure values
1121/// (Phase F), this signature widens to carry `(*state, captures, ...)`.
1122fn declare_helper_function<'ctx>(
1123 ctx: &'ctx Context,
1124 module: &LlvmModule<'ctx>,
1125 func: &Func,
1126 slot: usize,
1127) -> Result<FunctionValue<'ctx>, LlvmError> {
1128 let mut param_types: Vec<BasicMetadataTypeEnum<'ctx>> = Vec::with_capacity(func.params.len());
1129 for (i, p) in func.params.iter().enumerate() {
1130 let bt = ir_ty_to_llvm_abi(ctx, *p).ok_or_else(|| {
1131 LlvmError::UnsupportedSignature(format!(
1132 "llvm-aot: helper `{}` param #{i} type {p:?} unsupported",
1133 func.name
1134 ))
1135 })?;
1136 param_types.push(basic_to_metadata(bt));
1137 }
1138 let ret_bt = ir_ty_to_llvm_abi(ctx, func.ret).ok_or_else(|| {
1139 LlvmError::UnsupportedSignature(format!(
1140 "llvm-aot: helper `{}` return type {:?} unsupported",
1141 func.name, func.ret
1142 ))
1143 })?;
1144 let fn_type = match ret_bt {
1145 BasicTypeEnum::IntType(t) => t.fn_type(¶m_types, false),
1146 BasicTypeEnum::FloatType(t) => t.fn_type(¶m_types, false),
1147 BasicTypeEnum::PointerType(t) => t.fn_type(¶m_types, false),
1148 other => {
1149 return Err(LlvmError::Codegen(format!(
1150 "llvm-aot: helper `{}` ret BasicType {other:?} unsupported",
1151 func.name
1152 )));
1153 }
1154 };
1155 // Use a deterministic LLVM symbol so the entry's call site can be
1156 // pretty-printed in the IR dump. The slot keeps multiple helpers
1157 // with the same source name (shouldn't happen, but cheap) from
1158 // colliding.
1159 let _ = slot;
1160 let llvm_name = format!("relon_helper_{}", func.name);
1161 let fv = module.add_function(&llvm_name, fn_type, Some(Linkage::Internal));
1162 Ok(fv)
1163}
1164
1165/// Phase F.W7: declare a lambda function's LLVM signature without
1166/// emitting its body. Lambdas always carry the
1167/// `(state: ptr, ...lambda.params) -> ret` signature — the first IR
1168/// param is the captures_ptr the IR lowering pass prepended in
1169/// `lower_closure_as_value`, surfaced through LLVM param 1. Subsequent
1170/// LLVM params correspond to the lambda's user-visible args.
1171///
1172/// The implicit `*state` pointer at LLVM param 0 mirrors the
1173/// buffer-protocol entry's leading state slot so the lambda body's
1174/// `Op::AllocScratch{,Dyn}` / `Op::*AtAbsolute` ops can resolve
1175/// `arena_base` + scratch cursors through the same helper paths the
1176/// entry uses.
1177fn declare_lambda_function<'ctx>(
1178 ctx: &'ctx Context,
1179 module: &LlvmModule<'ctx>,
1180 func: &Func,
1181 slot: usize,
1182) -> Result<FunctionValue<'ctx>, LlvmError> {
1183 let ptr_t = ctx.ptr_type(AddressSpace::default());
1184 let mut param_types: Vec<BasicMetadataTypeEnum<'ctx>> =
1185 Vec::with_capacity(1 + func.params.len());
1186 param_types.push(ptr_t.into());
1187 for (i, p) in func.params.iter().enumerate() {
1188 let bt = ir_ty_to_llvm_abi(ctx, *p).ok_or_else(|| {
1189 LlvmError::UnsupportedSignature(format!(
1190 "llvm-aot: lambda `{}` param #{i} type {p:?} unsupported",
1191 func.name
1192 ))
1193 })?;
1194 param_types.push(basic_to_metadata(bt));
1195 }
1196 let ret_bt = ir_ty_to_llvm_abi(ctx, func.ret).ok_or_else(|| {
1197 LlvmError::UnsupportedSignature(format!(
1198 "llvm-aot: lambda `{}` return type {:?} unsupported",
1199 func.name, func.ret
1200 ))
1201 })?;
1202 let fn_type = match ret_bt {
1203 BasicTypeEnum::IntType(t) => t.fn_type(¶m_types, false),
1204 BasicTypeEnum::FloatType(t) => t.fn_type(¶m_types, false),
1205 BasicTypeEnum::PointerType(t) => t.fn_type(¶m_types, false),
1206 other => {
1207 return Err(LlvmError::Codegen(format!(
1208 "llvm-aot: lambda `{}` ret BasicType {other:?} unsupported",
1209 func.name
1210 )));
1211 }
1212 };
1213 // `relon_lambda_<slot>_<name>` so the emitted IR dump is greppable
1214 // when debugging which `fn_table_idx` mapped to which body.
1215 let llvm_name = format!("relon_lambda_{}_{}", slot, func.name);
1216 let fv = module.add_function(&llvm_name, fn_type, Some(Linkage::Internal));
1217 Ok(fv)
1218}
1219
1220/// Phase E.2: declare the `llvm.trap` intrinsic on `module` if it is
1221/// not already present. The intrinsic has signature `void @llvm.trap()`
1222/// — calling it raises a target-specific trap (a `ud2` on x86-64) that
1223/// the host's `panic` handler can catch when paired with an
1224/// `unreachable`. Cheap to call on every emit pass; we keep the lookup
1225/// idempotent so test fixtures that re-enter the emitter don't end up
1226/// with duplicate declarations.
1227fn declare_llvm_trap<'ctx>(ctx: &'ctx Context, module: &LlvmModule<'ctx>) -> FunctionValue<'ctx> {
1228 if let Some(f) = module.get_function("llvm.trap") {
1229 return f;
1230 }
1231 let void_t = ctx.void_type();
1232 let fn_ty = void_t.fn_type(&[], false);
1233 module.add_function("llvm.trap", fn_ty, None)
1234}
1235
1236/// Phase 0b: declare the `relon_llvm_call_native` host-dispatch helper
1237/// on `module` if absent. Signature mirrors the Rust helper:
1238///
1239/// ```text
1240/// i64 relon_llvm_call_native(ptr state, i32 import_idx,
1241/// ptr args_ptr, i32 arg_count)
1242/// ```
1243///
1244/// `Linkage::External` so MCJIT resolves it to the host address the
1245/// evaluator registers via `add_global_mapping` (the default resolver
1246/// can't see the static from inside the host dylib's section layout —
1247/// same constraint as the `str.contains` shim). Idempotent so repeated
1248/// emit passes don't duplicate the declaration.
1249fn declare_call_native<'ctx>(ctx: &'ctx Context, module: &LlvmModule<'ctx>) -> FunctionValue<'ctx> {
1250 if let Some(f) = module.get_function(crate::state::RELON_LLVM_CALL_NATIVE_SYMBOL) {
1251 return f;
1252 }
1253 let i64_t = ctx.i64_type();
1254 let i32_t = ctx.i32_type();
1255 let ptr_t = ctx.ptr_type(AddressSpace::default());
1256 let fn_ty = i64_t.fn_type(
1257 &[ptr_t.into(), i32_t.into(), ptr_t.into(), i32_t.into()],
1258 false,
1259 );
1260 module.add_function(
1261 crate::state::RELON_LLVM_CALL_NATIVE_SYMBOL,
1262 fn_ty,
1263 Some(Linkage::External),
1264 )
1265}
1266
1267/// Stage 1.B closed-world: declare a host `#native` fn as an external
1268/// `(i64...) -> i64` so `Op::CallNative` can emit a direct
1269/// `call @<host_symbol>`. Every scalar arg / return rides the i64 lane
1270/// (Bool / I32 zero-extend in; Unit returns `void`), matching the host
1271/// shim's `#[no_mangle] extern "C" fn(i64...) -> i64` ABI the
1272/// co-compile step links in. Idempotent: a repeated import name reuses
1273/// the existing declaration.
1274///
1275/// The lane is deliberately the same i64 width the open-world helper
1276/// decodes, so the two paths are bit-for-bit differential-comparable.
1277fn declare_host_fn_direct<'ctx>(
1278 ctx: &'ctx Context,
1279 module: &LlvmModule<'ctx>,
1280 import: &relon_ir::ir::NativeImport,
1281) -> FunctionValue<'ctx> {
1282 if let Some(f) = module.get_function(&import.name) {
1283 return f;
1284 }
1285 let i64_t = ctx.i64_type();
1286 let params: Vec<BasicMetadataTypeEnum<'ctx>> =
1287 import.param_tys.iter().map(|_| i64_t.into()).collect();
1288 let fn_ty = match import.ret_ty {
1289 IrType::Unit => ctx.void_type().fn_type(¶ms, false),
1290 _ => i64_t.fn_type(¶ms, false),
1291 };
1292 module.add_function(&import.name, fn_ty, Some(Linkage::External))
1293}
1294
1295/// #359 (W20): map an [`IrType`] to the LLVM type used in a helper /
1296/// lambda **call ABI** slot. This mirrors the operand-stack
1297/// convention where `F64` rides as its 64-bit *bit pattern* in an i64
1298/// register: `F64` maps to `i64`, not `double`. Keeping the ABI int-
1299/// only means a `CallClosure` / `Op::Call` site never has to bitcast
1300/// between the i64-bits stack representation and a native-float
1301/// argument / return slot — the value flows through verbatim. The
1302/// W20 n-body helpers (`pair_force` / `accel` return `F64`,
1303/// `pair_force` takes an `F64` mass) are the first closures with a
1304/// Float in their signature; without this they'd declare a `double`
1305/// slot that the i64-bits operand stack cannot feed.
1306fn ir_ty_to_llvm_abi<'ctx>(ctx: &'ctx Context, ty: IrType) -> Option<BasicTypeEnum<'ctx>> {
1307 match ty {
1308 IrType::I64 | IrType::F64 => Some(ctx.i64_type().into()),
1309 IrType::I32 | IrType::Bool | IrType::Unit => Some(ctx.i32_type().into()),
1310 IrType::String
1311 | IrType::ListInt
1312 | IrType::ListFloat
1313 | IrType::ListBool
1314 | IrType::ListString
1315 | IrType::ListSchema
1316 | IrType::ListList
1317 | IrType::Closure
1318 | IrType::Dict => Some(ctx.i32_type().into()),
1319 }
1320}
1321
1322fn basic_to_metadata(bt: BasicTypeEnum<'_>) -> BasicMetadataTypeEnum<'_> {
1323 match bt {
1324 BasicTypeEnum::IntType(t) => t.into(),
1325 BasicTypeEnum::FloatType(t) => t.into(),
1326 BasicTypeEnum::PointerType(t) => t.into(),
1327 BasicTypeEnum::ArrayType(t) => t.into(),
1328 BasicTypeEnum::StructType(t) => t.into(),
1329 BasicTypeEnum::VectorType(t) => t.into(),
1330 BasicTypeEnum::ScalableVectorType(t) => t.into(),
1331 }
1332}
1333
1334/// Lower a sibling helper's body against its declared LLVM
1335/// `FunctionValue`. Mirrors [`emit_legacy_entry`] but without enforcing
1336/// the legacy-i64 envelope — helpers may carry any
1337/// [`IrType`]-shaped param / return mix that `ir_ty_to_llvm_abi`
1338/// accepts.
1339fn emit_helper_body<'ctx>(
1340 ctx: &'ctx Context,
1341 module: &LlvmModule<'ctx>,
1342 func: &Func,
1343 llvm_fn: FunctionValue<'ctx>,
1344 const_pool: &ConstPool,
1345 helper_table: &HashMap<u32, FunctionValue<'ctx>>,
1346) -> Result<(), LlvmError> {
1347 let entry_bb = ctx.append_basic_block(llvm_fn, "entry");
1348 let builder = ctx.create_builder();
1349 builder.position_at_end(entry_bb);
1350
1351 let mut emit = Emit::new(
1352 ctx,
1353 &builder,
1354 module,
1355 llvm_fn,
1356 EntryShape::LegacyI64,
1357 /*arena_base_ptr=*/ None,
1358 /*state_ptr=*/ None,
1359 /*buffer_return_size=*/ 0,
1360 const_pool,
1361 );
1362 // Helper functions have no implicit state slot; `LocalGet(0)` maps
1363 // straight to LLVM param 0.
1364 emit.param_base = 0;
1365 emit.helper_table = Some(helper_table.clone());
1366 // Record the IR-declared return type so `Op::Return` knows what to
1367 // widen / truncate to when the operand stack value's width differs
1368 // from the LLVM signature's return slot.
1369 emit.helper_ret_ty = Some(func.ret);
1370 emit.llvm_trap_fn = Some(declare_llvm_trap(ctx, module));
1371 emit.let_floor = relon_ir::ir::body_let_watermark(&func.body);
1372 emit.lower_body(&func.body)?;
1373 Ok(())
1374}
1375
/// Phase F.W7: emit a lambda body. Mirrors [`emit_helper_body`] but:
///
/// - The first LLVM param (`*state`) is materialised into
///   `arena_base_ptr` + `state_ptr` so the body's
///   `Op::AllocScratch{,Dyn}` / `Op::*AtAbsolute` ops resolve against
///   the per-call arena state. Required because lambdas read captures
///   via `LocalGet(0); LoadI32AtAbsolute { offset }` against the
///   captures struct in scratch.
/// - `param_base = 1` so the IR's `LocalGet(idx)` skips the implicit
///   state slot — `LocalGet(0)` therefore reads the captures_ptr at
///   LLVM param 1, matching what the IR lowering pass laid out in
///   `lower_closure_as_value`.
/// - The closure table is threaded through so nested
///   `Op::MakeClosure` / `Op::CallClosure` ops inside the lambda body
///   keep resolving against the same module-wide lambda set the entry
///   uses.
///
/// `self_capture_offsets` and `known_capture_offsets` are copied onto
/// the emitter unchanged; they are the per-lambda rows of the tables
/// built by `build_self_capture_table` / `build_known_capture_table`.
///
/// # Errors
///
/// Returns [`LlvmError::Codegen`] when `llvm_fn` lacks LLVM param 0
/// (state) or param 1 (captures_ptr), or when building the arena-base
/// GEP / load / inttoptr fails; propagates any error from lowering the
/// body.
///
/// # Panics
///
/// `into_pointer_value` / `into_int_value` panic if param 0 is not a
/// pointer or param 1 is not an integer. `declare_lambda_function`
/// declares them with those kinds.
#[allow(clippy::too_many_arguments)]
fn emit_lambda_body<'ctx>(
    ctx: &'ctx Context,
    module: &LlvmModule<'ctx>,
    func: &Func,
    llvm_fn: FunctionValue<'ctx>,
    const_pool: &ConstPool,
    helper_table: &HashMap<u32, FunctionValue<'ctx>>,
    closure_fn_table: &[FunctionValue<'ctx>],
    self_capture_offsets: &[(u32, u32)],
    known_capture_offsets: &[(u32, u32)],
) -> Result<(), LlvmError> {
    let entry_bb = ctx.append_basic_block(llvm_fn, "entry");
    let builder = ctx.create_builder();
    builder.position_at_end(entry_bb);

    // Materialise `state_ptr` + `arena_base_ptr` at function entry.
    // Same pointer-arithmetic shape the buffer entry uses — the lambda
    // shares the per-call `ArenaState` layout because the host (the
    // entry function or another lambda) passes its own state pointer
    // through to the call indirect site verbatim.
    let i32_t = ctx.i32_type();
    let i64_t = ctx.i64_type();
    let i8_t = ctx.i8_type();
    let ptr_t = ctx.ptr_type(AddressSpace::default());
    let state_param = llvm_fn
        .get_nth_param(0)
        .ok_or_else(|| LlvmError::Codegen(format!("lambda `{}` missing state param", func.name)))?
        .into_pointer_value();
    // SAFETY (assumption, not verifiable from this function — TODO
    // confirm against the call sites): the `inbounds` GEP is only valid
    // if every caller passes a `*state` pointer to a live state object
    // at least `ARENA_STATE_OFFSET_BASE` bytes plus one 64-bit word long.
    // The GEP is a byte offset (`i8` element type) into that object.
    let arena_base_gep = unsafe {
        builder
            .build_in_bounds_gep(
                i8_t,
                state_param,
                &[i32_t.const_int(ARENA_STATE_OFFSET_BASE as u64, false)],
                "lambda_arena_base_gep",
            )
            .map_err(|e| LlvmError::Codegen(format!("lambda arena_base GEP: {e}")))?
    };
    // TODO(P3-wasm32): use DataLayout pointer width instead of i64
    // for the arena-base word load + inttoptr below.
    let arena_base_load = builder
        .build_load(i64_t, arena_base_gep, "lambda_arena_base")
        .map_err(|e| LlvmError::Codegen(format!("lambda arena_base load: {e}")))?;
    // Tag the load via `mark_invariant_load` (defined elsewhere in this
    // file; presumably attaches invariant-load metadata — confirm there).
    mark_invariant_load(ctx, arena_base_load);
    let arena_base_int = arena_base_load.into_int_value();
    // The arena base is stored as an integer word; turn it back into a
    // pointer for the emitter.
    let arena_base_ptr = builder
        .build_int_to_ptr(arena_base_int, ptr_t, "lambda_arena_base_ptr")
        .map_err(|e| LlvmError::Codegen(format!("lambda arena_base inttoptr: {e}")))?;

    // Stash the captures_ptr LLVM param (param 1) so the self-recursion
    // fast path in `emit_call_closure` can reuse it directly instead
    // of round-tripping through a `captures_ptr` field load on every
    // recursion. The lambda signature pins this to LLVM param 1 (param
    // 0 is `*state`) — see `declare_lambda_function`.
    let captures_ptr_param = llvm_fn
        .get_nth_param(1)
        .ok_or_else(|| {
            LlvmError::Codegen(format!("lambda `{}` missing captures_ptr param", func.name))
        })?
        .into_int_value();

    let mut emit = Emit::new(
        ctx,
        &builder,
        module,
        llvm_fn,
        EntryShape::LegacyI64,
        Some(arena_base_ptr),
        Some(state_param),
        /*buffer_return_size=*/ 0,
        const_pool,
    );
    // LLVM param 0 is `*state`; the IR's params (including the
    // implicit captures_ptr at IR index 0) start at LLVM param 1.
    emit.param_base = 1;
    emit.helper_table = Some(helper_table.clone());
    emit.closure_fn_table = closure_fn_table.to_vec();
    // The lambda body's `Op::Return` carries the IR-declared return
    // type so the dispatcher knows what LLVM `ret` shape to emit.
    emit.helper_ret_ty = Some(func.ret);
    emit.llvm_trap_fn = Some(declare_llvm_trap(ctx, module));
    emit.self_capture_offsets = self_capture_offsets.to_vec();
    emit.known_capture_offsets = known_capture_offsets.to_vec();
    emit.captures_ptr_param = Some(captures_ptr_param);
    emit.let_floor = relon_ir::ir::body_let_watermark(&func.body);
    emit.lower_body(&func.body)?;
    Ok(())
}
1481
1482/// Phase D.1: emit a typed `(i64, i64, ...) -> i64` fast entry
1483/// alongside the buffer-protocol entry. Reuses the IR body's op
1484/// stream but rewrites every buffer-protocol `LoadField` into a
1485/// direct LLVM param read (via `profile.arg_offsets`) and every
1486/// trailing `StoreField` at the return-value offset into a `ret`
1487/// against the stashed value.
1488///
1489/// Returns `Err` when the IR contains ops outside the fast-path
1490/// envelope (string ops, sandbox traps, pointer-indirect StoreField,
1491/// stdlib calls — anything that escapes the simple Int-arithmetic
1492/// loop). The evaluator side surfaces this as "fast path unavailable;
1493/// fall back to the buffer entry" rather than a hard error so adding
1494/// more workloads doesn't risk regressing the buffer path.
1495pub(crate) fn emit_fast_entry<'ctx>(
1496 ctx: &'ctx Context,
1497 module: &LlvmModule<'ctx>,
1498 func: &Func,
1499 profile: &FastPathProfile,
1500 helper_table: &HashMap<u32, FunctionValue<'ctx>>,
1501 closure_fn_table: &[FunctionValue<'ctx>],
1502) -> Result<FunctionValue<'ctx>, LlvmError> {
1503 if !is_buffer_protocol_signature(&func.params, func.ret) {
1504 return Err(LlvmError::UnsupportedSignature(
1505 "fast-path entry requires buffer-protocol IR".into(),
1506 ));
1507 }
1508 let arity = profile.arg_offsets.len();
1509 if arity > 8 {
1510 // Cap at 8 to keep the typed dispatch table in evaluator.rs
1511 // finite. Sources with arity > 8 stay on the buffer path —
1512 // their boundary cost is amortised across more work anyway.
1513 return Err(LlvmError::UnsupportedSignature(format!(
1514 "fast-path entry: arity {arity} exceeds cap of 8"
1515 )));
1516 }
1517
1518 let i64_t = ctx.i64_type();
1519 let param_types: Vec<BasicMetadataTypeEnum<'ctx>> = (0..arity).map(|_| i64_t.into()).collect();
1520 let fn_type = i64_t.fn_type(¶m_types, false);
1521 let llvm_fn = module.add_function(ENTRY_SYMBOL_FAST, fn_type, None);
1522
1523 let entry_bb = ctx.append_basic_block(llvm_fn, "fast_entry");
1524 let builder = ctx.create_builder();
1525 builder.position_at_end(entry_bb);
1526
1527 // Reserve an alloca for the return value. The fast emitter
1528 // rewrites the trailing `StoreField` / `StoreFieldAtRecord` at
1529 // the return slot (which under buffer protocol writes the i64
1530 // result into the arena) to a store into this slot; the implicit
1531 // `Op::Return` at end-of-body loads from the slot and `ret`s it.
1532 // Placing the alloca in the entry block lets LLVM's mem2reg
1533 // promote it to SSA across the loop boundary.
1534 let ret_slot = builder
1535 .build_alloca(i64_t, "fast_ret_slot")
1536 .map_err(|e| LlvmError::Codegen(format!("fast ret_slot alloca: {e}")))?;
1537 // Initialise to 0 so any early `Op::Return` (no value path) still
1538 // produces a defined value — matches the buffer entry's
1539 // "ret root_size when no scalar stored" envelope.
1540 builder
1541 .build_store(ret_slot, i64_t.const_zero())
1542 .map_err(|e| LlvmError::Codegen(format!("fast ret_slot init: {e}")))?;
1543
1544 // The fast entry is a typed `(i64...) -> i64` shape derived from
1545 // the buffer-protocol IR after the dispatch-boundary rewrite. It
1546 // doesn't touch the const-data pool (the IR only contains scalar
1547 // arithmetic ops) so we hand it an empty pool to keep
1548 // `Emit::new` polymorphic.
1549 let empty_pool = ConstPool::default();
1550 let mut emit = Emit::new(
1551 ctx,
1552 &builder,
1553 module,
1554 llvm_fn,
1555 EntryShape::LegacyI64,
1556 /*arena_base_ptr=*/ None,
1557 /*state_ptr=*/ None,
1558 /*buffer_return_size=*/ 0,
1559 &empty_pool,
1560 );
1561 emit.fast_path = Some(FastEmit {
1562 profile: profile.clone(),
1563 ret_slot,
1564 });
1565 // LLVM param i corresponds to arg i — no implicit state slot for
1566 // the fast entry. `LocalGet` should never appear in the body
1567 // because the IR producer only emits LocalGet for the handshake
1568 // params (which the fast path doesn't pass).
1569 emit.param_base = 0;
1570 emit.llvm_trap_fn = Some(declare_llvm_trap(ctx, module));
1571 // Phase D.2: plumb the module-wide helper and closure tables so
1572 // an in-body `Op::Call` / `Op::MakeClosure` / `Op::CallClosure`
1573 // can resolve sibling functions. The fast emitter's per-op rewrites
1574 // (`MakeClosure` → virtualised closure, `CallClosure` → direct
1575 // call with null state/captures) consult these tables to pick the
1576 // matching `FunctionValue`.
1577 emit.helper_table = Some(helper_table.clone());
1578 emit.closure_fn_table = closure_fn_table.to_vec();
1579 emit.let_floor = relon_ir::ir::body_let_watermark(&func.body);
1580 emit.lower_body(&func.body)?;
1581
1582 // The buffer-protocol IR ends with `Op::Return` which the fast
1583 // emitter rewrote into a load+ret. If the body fell through
1584 // without an explicit Return (shouldn't happen for well-formed
1585 // `#main` IR, but be defensive), seal it with a load+ret.
1586 if let Some(cur) = builder.get_insert_block() {
1587 if cur.get_terminator().is_none() {
1588 let v = builder
1589 .build_load(i64_t, ret_slot, "fast_ret_load")
1590 .map_err(|e| LlvmError::Codegen(format!("fast trailing load: {e}")))?
1591 .into_int_value();
1592 builder
1593 .build_return(Some(&v))
1594 .map_err(|e| LlvmError::Codegen(format!("fast trailing ret: {e}")))?;
1595 }
1596 }
1597
1598 Ok(llvm_fn)
1599}
1600
1601// ---------------------------------------------------------------------------
1602// Legacy-i64 entry (Phase A bootstrap envelope, retained for tests)
1603// ---------------------------------------------------------------------------
1604
1605fn emit_legacy_entry_with_helpers<'ctx>(
1606 ctx: &'ctx Context,
1607 module: &LlvmModule<'ctx>,
1608 func: &Func,
1609 helper_table: &HashMap<u32, FunctionValue<'ctx>>,
1610 imports: &[relon_ir::ir::NativeImport],
1611 world_mode: WorldMode,
1612) -> Result<FunctionValue<'ctx>, LlvmError> {
1613 emit_legacy_entry_impl(ctx, module, func, Some(helper_table), imports, world_mode)
1614}
1615
1616/// Emit a Phase-A `(I64...) -> I64` function. Used by tests + the
1617/// Phase A bootstrap benchmarks that exercise the hand-built IR
1618/// fixtures directly (no buffer-protocol wrapping).
1619fn emit_legacy_entry_impl<'ctx>(
1620 ctx: &'ctx Context,
1621 module: &LlvmModule<'ctx>,
1622 func: &Func,
1623 helper_table: Option<&HashMap<u32, FunctionValue<'ctx>>>,
1624 imports: &[relon_ir::ir::NativeImport],
1625 world_mode: WorldMode,
1626) -> Result<FunctionValue<'ctx>, LlvmError> {
1627 for (i, p) in func.params.iter().enumerate() {
1628 if *p != IrType::I64 {
1629 return Err(LlvmError::UnsupportedSignature(format!(
1630 "llvm-aot: legacy-i64 envelope expects I64 param at #{i}, got {p:?}"
1631 )));
1632 }
1633 }
1634 if func.ret != IrType::I64 {
1635 return Err(LlvmError::UnsupportedSignature(format!(
1636 "llvm-aot: legacy-i64 envelope expects I64 return, got {:?}",
1637 func.ret
1638 )));
1639 }
1640
1641 let i64_t = ctx.i64_type();
1642 let param_types: Vec<BasicMetadataTypeEnum<'ctx>> =
1643 (0..func.params.len()).map(|_| i64_t.into()).collect();
1644 let fn_type = i64_t.fn_type(¶m_types, false);
1645 let llvm_fn = module.add_function(ENTRY_SYMBOL, fn_type, None);
1646
1647 let entry_bb = ctx.append_basic_block(llvm_fn, "entry");
1648 let builder = ctx.create_builder();
1649 builder.position_at_end(entry_bb);
1650
1651 // Legacy-i64 entry shape only consumes the hand-built fixtures
1652 // (helloworld_arith) which never reference ConstString — an empty
1653 // pool is enough.
1654 let empty_pool = ConstPool::default();
1655 let mut emit = Emit::new(
1656 ctx,
1657 &builder,
1658 module,
1659 llvm_fn,
1660 EntryShape::LegacyI64,
1661 None,
1662 None,
1663 /*buffer_return_size=*/ 0,
1664 &empty_pool,
1665 );
1666 // Param order under the legacy envelope: every IR LocalGet(i)
1667 // maps to llvm_fn.param(i) — no implicit state slot.
1668 emit.param_base = 0;
1669 if let Some(table) = helper_table {
1670 emit.helper_table = Some(table.clone());
1671 }
1672 emit.llvm_trap_fn = Some(declare_llvm_trap(ctx, module));
1673 // Stage 1.B: closed-world legacy entry threads the `#native` import
1674 // table + pre-declares each host fn as an `extern` so `CallNative`
1675 // emits a direct `call @<host_symbol>` (no state pointer needed).
1676 // The open-world legacy path keeps `imports` empty (the legacy
1677 // fixtures never carry a `CallNative`).
1678 emit.imports = imports;
1679 emit.world_mode = world_mode;
1680 if matches!(world_mode, WorldMode::ClosedWorld) {
1681 for import in imports {
1682 declare_host_fn_direct(ctx, module, import);
1683 }
1684 }
1685 emit.let_floor = relon_ir::ir::body_let_watermark(&func.body);
1686 emit.lower_body(&func.body)?;
1687
1688 Ok(llvm_fn)
1689}
1690
1691// ---------------------------------------------------------------------------
1692// Buffer-protocol entry (Phase B production envelope)
1693// ---------------------------------------------------------------------------
1694
1695// Retained for symmetry with `emit_legacy_entry_with_helpers`; the
1696// Phase F.W7 emit path always routes through
1697// `emit_buffer_entry_with_helpers_and_closures` so a closure-free
1698// module still gets the new entry shape (with an empty closure
1699// table). Marked `#[allow(dead_code)]` to keep the symmetric pair
1700// visible without firing the unused-function lint.
1701#[allow(dead_code)]
1702fn emit_buffer_entry_with_helpers<'ctx>(
1703 ctx: &'ctx Context,
1704 module: &LlvmModule<'ctx>,
1705 func: &Func,
1706 buffer_return_size: u32,
1707 const_pool: &ConstPool,
1708 helper_table: &HashMap<u32, FunctionValue<'ctx>>,
1709) -> Result<FunctionValue<'ctx>, LlvmError> {
1710 emit_buffer_entry_impl(
1711 ctx,
1712 module,
1713 func,
1714 buffer_return_size,
1715 const_pool,
1716 Some(helper_table),
1717 &[],
1718 &[],
1719 WorldMode::OpenWorld,
1720 crate::CodegenTarget::Native,
1721 &[],
1722 )
1723}
1724
1725/// Phase F.W7 variant: same as [`emit_buffer_entry_with_helpers`] but
1726/// also threads the closure function-pointer table into the entry's
1727/// `Emit` so the body's `Op::MakeClosure` lowering can stamp the
1728/// matching `fn_table_idx` into the closure handle.
1729#[allow(clippy::too_many_arguments)]
1730fn emit_buffer_entry_with_helpers_and_closures<'ctx, 'cp>(
1731 ctx: &'ctx Context,
1732 module: &LlvmModule<'ctx>,
1733 func: &Func,
1734 buffer_return_size: u32,
1735 const_pool: &'cp ConstPool,
1736 helper_table: &HashMap<u32, FunctionValue<'ctx>>,
1737 closure_fn_table: &[FunctionValue<'ctx>],
1738 imports: &'cp [relon_ir::ir::NativeImport],
1739 world_mode: WorldMode,
1740 target: crate::CodegenTarget,
1741 effectful_imports: &'cp [bool],
1742) -> Result<FunctionValue<'ctx>, LlvmError> {
1743 emit_buffer_entry_impl(
1744 ctx,
1745 module,
1746 func,
1747 buffer_return_size,
1748 const_pool,
1749 Some(helper_table),
1750 closure_fn_table,
1751 imports,
1752 world_mode,
1753 target,
1754 effectful_imports,
1755 )
1756}
1757
1758/// Emit the buffer-protocol entry function. The cranelift backend's
1759/// equivalent lives in `relon-codegen-cranelift::codegen::mod.rs` —
1760/// signature mirrored here so a host that holds either evaluator
1761/// can dispatch through the same `(state, in_ptr, …)` argv shape.
1762#[allow(clippy::too_many_arguments)]
1763fn emit_buffer_entry_impl<'ctx, 'cp>(
1764 ctx: &'ctx Context,
1765 module: &LlvmModule<'ctx>,
1766 func: &Func,
1767 buffer_return_size: u32,
1768 const_pool: &'cp ConstPool,
1769 helper_table: Option<&HashMap<u32, FunctionValue<'ctx>>>,
1770 closure_fn_table: &[FunctionValue<'ctx>],
1771 imports: &'cp [relon_ir::ir::NativeImport],
1772 world_mode: WorldMode,
1773 target: crate::CodegenTarget,
1774 effectful_imports: &'cp [bool],
1775) -> Result<FunctionValue<'ctx>, LlvmError> {
1776 let i32_t = ctx.i32_type();
1777 let i64_t = ctx.i64_type();
1778 let ptr_t = ctx.ptr_type(AddressSpace::default());
1779
1780 // (*state, i32 in_ptr, i32 in_len, i32 out_ptr, i32 out_cap, i64 caps) -> i32
1781 let param_types: Vec<BasicMetadataTypeEnum<'ctx>> = vec![
1782 ptr_t.into(),
1783 i32_t.into(),
1784 i32_t.into(),
1785 i32_t.into(),
1786 i32_t.into(),
1787 i64_t.into(),
1788 ];
1789 let fn_type = i32_t.fn_type(¶m_types, false);
1790 let llvm_fn = module.add_function(ENTRY_SYMBOL, fn_type, None);
1791
1792 let entry_bb = ctx.append_basic_block(llvm_fn, "entry");
1793 let builder = ctx.create_builder();
1794 builder.position_at_end(entry_bb);
1795
1796 // Resolve the per-call arena base once at function entry. The
1797 // LoadField / StoreField helpers consume this cached value so
1798 // the JIT doesn't reload `state->arena_base` on every access.
1799 let state_param = llvm_fn
1800 .get_nth_param(0)
1801 .ok_or_else(|| LlvmError::Codegen("buffer entry missing state param".into()))?
1802 .into_pointer_value();
1803
1804 // Pointer arithmetic on the state struct: GEP by ARENA_STATE_OFFSET_BASE
1805 // bytes through an i8 view, then load the `usize` arena base.
1806 // We use opaque pointers so the GEP element type only matters
1807 // for the offset calculation.
1808 let i8_t = ctx.i8_type();
1809 let arena_base_gep = unsafe {
1810 builder
1811 .build_in_bounds_gep(
1812 i8_t,
1813 state_param,
1814 &[i32_t.const_int(ARENA_STATE_OFFSET_BASE as u64, false)],
1815 "arena_base_gep",
1816 )
1817 .map_err(|e| LlvmError::Codegen(format!("arena_base GEP: {e}")))?
1818 };
1819 // `arena_base` is `usize`. On every supported host that's i64
1820 // (we only target x86_64 today; the inkwell feature set in the
1821 // Cargo.toml is `target-x86`). If we add a 32-bit host the
1822 // load type needs to follow `pointer_type` width — Phase B
1823 // assumes the workspace's only target is 64-bit.
1824 // TODO(P3-wasm32): use DataLayout pointer width instead of i64
1825 // for the arena-base word load + inttoptr below.
1826 let arena_base_load = builder
1827 .build_load(i64_t, arena_base_gep, "arena_base")
1828 .map_err(|e| LlvmError::Codegen(format!("arena_base load: {e}")))?;
1829 mark_invariant_load(ctx, arena_base_load);
1830 let arena_base_int = arena_base_load.into_int_value();
1831 let arena_base_ptr = builder
1832 .build_int_to_ptr(arena_base_int, ptr_t, "arena_base_ptr")
1833 .map_err(|e| LlvmError::Codegen(format!("arena_base inttoptr: {e}")))?;
1834
1835 // Phase E.1 prologue: init `state.tail_cursor = buffer_return_size`
1836 // so the first pointer-indirect StoreField lands past the fixed
1837 // area. Cheap (one store per call) — keeping it unconditional
1838 // avoids a body pre-scan. Bodies that never touch the tail
1839 // cursor pay the dead store; mem2reg / DSE eliminate it at -O3.
1840 let tail_init_gep = unsafe {
1841 builder
1842 .build_in_bounds_gep(
1843 i8_t,
1844 state_param,
1845 &[i32_t.const_int(u64::from(ARENA_STATE_OFFSET_TAIL_CURSOR), false)],
1846 "tail_cursor_init_gep",
1847 )
1848 .map_err(|e| LlvmError::Codegen(format!("tail_cursor init GEP: {e}")))?
1849 };
1850 let tail_init = i32_t.const_int(u64::from(buffer_return_size), false);
1851 builder
1852 .build_store(tail_init_gep, tail_init)
1853 .map_err(|e| LlvmError::Codegen(format!("tail_cursor init store: {e}")))?;
1854
1855 let mut emit = Emit::new(
1856 ctx,
1857 &builder,
1858 module,
1859 llvm_fn,
1860 EntryShape::Buffer,
1861 Some(arena_base_ptr),
1862 Some(state_param),
1863 buffer_return_size,
1864 const_pool,
1865 );
1866 // Buffer-protocol LocalGet(0..=3) reads the four i32 handshake
1867 // slots; LocalGet(4) reads the i64 `caps` slot. The state
1868 // pointer occupies slot 0 in the LLVM function — IR locals
1869 // start at +1 from there.
1870 emit.param_base = 1;
1871 if let Some(table) = helper_table {
1872 emit.helper_table = Some(table.clone());
1873 }
1874 emit.closure_fn_table = closure_fn_table.to_vec();
1875 emit.llvm_trap_fn = Some(declare_llvm_trap(ctx, module));
1876 // Phase 0b: thread the `#native` import table through so
1877 // `Op::CallNative` can validate the call shape.
1878 emit.imports = imports;
1879 emit.world_mode = world_mode;
1880 emit.target = target;
1881 emit.effectful_imports = effectful_imports;
1882 match world_mode {
1883 // Open-world (MCJIT / from_source): declare the dynamic-dispatch
1884 // helper so `Op::CallNative` emits a `call @relon_llvm_call_native`
1885 // that `add_global_mapping` later resolves to the host address.
1886 //
1887 // P3 §2.2: the wasm32 target has no MCJIT engine to patch the
1888 // helper symbol in — declaring it would leave an unresolvable
1889 // native import. The wasm path instead lowers each
1890 // `Op::CallNative` to a direct **wasm import** call
1891 // (`emit_call_native_wasi`), declaring the import lazily at the
1892 // call site, so we skip the helper declaration here.
1893 WorldMode::OpenWorld if matches!(target, crate::CodegenTarget::Wasm32) => {
1894 emit.call_native_fn = None;
1895 }
1896 WorldMode::OpenWorld => {
1897 emit.call_native_fn = Some(declare_call_native(ctx, module));
1898 }
1899 // Closed-world (Stage 1.B LTO co-compile): pre-declare every
1900 // host fn as an `extern` so `Op::CallNative` can emit a direct
1901 // `call @<host_symbol>`. The host bitcode is linked + inlined by
1902 // `crate::cocompile`. No `relon_llvm_call_native` helper exists
1903 // on this path.
1904 //
1905 // P3 §2.2 wasm closed-world: only pre-declare the **pure-compute**
1906 // host fns as direct externs (those get co-compiled + inlined).
1907 // An **effectful** host fn must NOT be inlined into the sandbox —
1908 // its `Op::CallNative` routes to `emit_call_native_wasi`, which
1909 // declares the `(import "env" …)` lazily. Pre-declaring it here as
1910 // a plain extern would still be link-resolved by the inlined-shim,
1911 // defeating the boundary, so we skip effectful imports.
1912 WorldMode::ClosedWorld => {
1913 emit.call_native_fn = None;
1914 for (idx, import) in imports.iter().enumerate() {
1915 let effectful = effectful_imports.get(idx).copied().unwrap_or(false);
1916 if !effectful {
1917 declare_host_fn_direct(ctx, module, import);
1918 }
1919 }
1920 }
1921 }
1922 emit.let_floor = relon_ir::ir::body_let_watermark(&func.body);
1923 emit.emit_step_budget_check("entry")?;
1924 emit.lower_body(&func.body)?;
1925
1926 Ok(llvm_fn)
1927}
1928
1929// ---------------------------------------------------------------------------
1930// Per-function emitter state
1931// ---------------------------------------------------------------------------
1932
/// Per-function emitter state. Holds the inkwell builder borrow,
/// the LLVM function the emit targets, the IR's operand stack, and
/// the alloca slots backing `LetSet` / `LetGet`.
///
/// `param_base` accounts for the entry-shape's implicit param slot:
/// the buffer-protocol entry has the `*state` pointer at LLVM param
/// 0, so `LocalGet(0)` resolves to LLVM param 1. The legacy-i64
/// entry has no implicit slot, so `param_base = 0`.
///
/// Lifetimes: `'ctx` is the inkwell `Context`, `'b` the builder /
/// module borrow held for the emit pass, and `'cp` the borrow of the
/// const pool, import table, and effectful-import flags.
pub(crate) struct Emit<'ctx, 'b, 'cp> {
    pub(crate) ctx: &'ctx Context,
    pub(crate) builder: &'b Builder<'ctx>,
    pub(crate) func: FunctionValue<'ctx>,
    /// Phase F.1: cached module reference so per-op lowering can
    /// declare extern symbols (the F.1 `str.contains` host shim) on
    /// demand without threading the module through every helper. The
    /// reference is borrowed for the emit pass only; `inkwell` keeps
    /// `Module` and `FunctionValue` lifetimes orthogonal so a borrow
    /// here doesn't conflict with the surrounding `add_function`
    /// calls in the entry/helper emit paths.
    pub(crate) module: &'b LlvmModule<'ctx>,
    pub(crate) shape: EntryShape,
    /// Cached `arena_base` pointer for the buffer-protocol entry.
    /// `None` for the legacy entry shape — `LoadField` / `StoreField`
    /// reject themselves before reaching for this value.
    pub(crate) arena_base_ptr: Option<PointerValue<'ctx>>,
    /// Cached state-pointer LLVM value (param 0 of the buffer entry).
    /// Phase E.1 uses it to load / store the per-call tail-cursor /
    /// scratch-cursor / scratch-base slots. `None` outside the
    /// buffer-protocol entry shape.
    pub(crate) state_ptr: Option<PointerValue<'ctx>>,
    /// Operand stack mirroring the IR's virtual stack. Every value
    /// in flight is an LLVM integer of the matching IR type. The
    /// pair tags the IR type so consumers can pick the right
    /// signed / unsigned predicate without re-deriving it.
    pub(crate) stack: Vec<TypedValue<'ctx>>,
    /// `LetSet { idx }` alloca slots, keyed by `idx`; each value pairs
    /// the slot pointer with the IR type stored there. Each
    /// idx has at most one type at a time — the IR lowering pass
    /// guarantees no aliasing between idx's of different types.
    pub(crate) let_slots: std::collections::HashMap<u32, (PointerValue<'ctx>, IrType)>,
    /// Static let-index floor for stdlib inline-frame windows: the
    /// function body's [`relon_ir::ir::body_let_watermark`], i.e. one
    /// past the highest let index the body (recursively) touches.
    /// `emit_call_stdlib` places each inline window at
    /// `max(declared-slots max + 1, let_floor)` so callee lets never
    /// collide with caller lets that are first bound *after* the
    /// inlined call. While a frame is active the floor is raised past
    /// the callee body's own watermark (and restored on frame pop) so
    /// nested inlines stay collision-free too.
    pub(crate) let_floor: u32,
    /// LLVM param offset corresponding to `LocalGet(0)`. See
    /// [`Self::lookup_param`] — `param_base + idx` is the LLVM
    /// param index.
    pub(crate) param_base: u32,
    /// Label stack carrying the (entry_bb, exit_bb, kind) of every
    /// nested [`Op::Block`] / [`Op::Loop`]. `Br { label_depth }`
    /// indexes from the back (depth 0 = innermost). `Block`s exit
    /// to their tail; `Loop`s exit to their head.
    pub(crate) label_stack: Vec<LabelFrame<'ctx>>,
    /// Monotonic counter to mint unique LLVM basic block / value
    /// names so the dumped IR is human-readable.
    pub(crate) name_seq: u32,
    /// Phase B: hard-coded `return_root_size` returned from a
    /// buffer-protocol `Op::Return`. The IR producer leaves no
    /// value on the operand stack for `Return` under buffer
    /// protocol — the trampoline reads back `bytes_written` to
    /// decode the output record. We hard-code this to the schema's
    /// `return_layout.root_size`, passed in at emit time.
    pub(crate) buffer_return_size: u32,
    /// Phase D.1: set when emitting the fast-path entry. The
    /// `Op::LoadField` / `Op::StoreField` / `Op::Return` lowering
    /// branches consult this to rewrite the buffer-protocol IR
    /// against the typed `(i64...) -> i64` LLVM signature.
    pub(crate) fast_path: Option<FastEmit<'ctx>>,
    /// Phase E.2 multi-function lookup: when populated, `Op::Call`
    /// with `fn_index >= stdlib_function_count()` resolves to the
    /// matching sibling `FunctionValue` and emits a direct LLVM
    /// `call`. The map is keyed by IR-side `funcs` index (i.e.
    /// `fn_index - stdlib_count`). Empty for hand-built fixtures that
    /// never reference user-defined functions.
    pub(crate) helper_table: Option<HashMap<u32, FunctionValue<'ctx>>>,
    /// Phase E.2: when emitting a helper body (not the entry), this
    /// carries the IR-declared return type so `Op::Return` can pick
    /// the right LLVM `ret` shape. `None` while lowering the entry
    /// body — the entry's return shape is dictated by `EntryShape`.
    pub(crate) helper_ret_ty: Option<IrType>,
    /// Phase E.2: cached `llvm.trap` intrinsic `FunctionValue`. The
    /// intrinsic is declared once per module (in
    /// [`emit_module_funcs`]); each `Emit` snapshots the pointer so
    /// per-op `Div(I64)` / `Mod(I64)` guards can call it without
    /// re-querying the module.
    pub(crate) llvm_trap_fn: Option<FunctionValue<'ctx>>,
    /// Phase E.1: per-module const-data lookup. `Op::ConstString { idx }`
    /// reads the matching offset and pushes `iconst(I32, off)`.
    pub(crate) const_pool: &'cp ConstPool,
    /// Phase E.1: stack of inline call frames. `Op::Call` pushes one
    /// before lowering the callee body; `Op::Return` inside the
    /// callee body pops the typed value into the topmost frame's
    /// result alloca and jumps to its exit block. The callee's
    /// `LocalGet(idx)` resolves to `params[idx]` rather than the
    /// entry's LLVM params; `LetGet/LetSet` indices are remapped
    /// against `let_offset` so concurrent inline frames don't clash.
    pub(crate) inline_frames: Vec<InlineFrame<'ctx>>,
    /// Phase E.1: did the body emit a pointer-indirect StoreField?
    /// When set, the buffer-protocol epilogue returns the post-bump
    /// tail cursor (in bytes past `out_ptr`) rather than the
    /// statically-known `buffer_return_size`. Mirrors cranelift's
    /// `needs_tail_cursor` flag.
    pub(crate) needs_tail_cursor: bool,
    /// In-place region-walk return ABI (S2): set by `emit_store_field`
    /// when the entry returns a `List<List<scalar>>` sourced directly
    /// from a `#main` parameter. Holds the **arena-relative** i32 offset
    /// of the root list header (the value `Op::LoadListListPtr` pushed,
    /// already rebased by `in_ptr`). No bytes are copied into `out_buf`;
    /// instead the buffer epilogue (`emit_return`) encodes this offset as
    /// the negative in-place sentinel `-(root_abs + 1)` and returns it,
    /// telling the host to verify + decode the value in place at its
    /// source region rather than at `out_ptr`. `None` for every other
    /// return shape, which keeps the existing `buffer_return_size` /
    /// tail-cursor epilogue. Mirrors cranelift's `inplace_return_root`.
    pub(crate) inplace_return_root: Option<IntValue<'ctx>>,
    /// Phase F.W7: ordered list of lambda `FunctionValue`s, indexed by
    /// `fn_table_idx`. `Op::MakeClosure { fn_table_idx }` stamps the
    /// matching index into the closure handle's `fn_table_idx` slot
    /// and uses the same lookup to resolve the function pointer to
    /// stash. `Op::CallClosure` reads the handle's `fn_table_idx`
    /// slot and dispatches indirectly through a private global table
    /// of function pointers seeded from this list. Empty when the
    /// module contains no lambdas.
    pub(crate) closure_fn_table: Vec<FunctionValue<'ctx>>,
    /// Phase F.W7: per-IR-`record_local_idx` allocas backing
    /// `Op::AllocRootRecord` / `Op::StoreFieldAtRecord`. The slot
    /// holds an i32 out_ptr-relative offset; `AllocRootRecord` writes
    /// `0` there (root sits at `out_ptr + 0`), `StoreFieldAtRecord`
    /// reads it back to compute the destination address. Mirrors
    /// cranelift's `record_locals` map.
    pub(crate) record_locals: std::collections::HashMap<u32, PointerValue<'ctx>>,
    /// Phase H: bytes literal pushed by the *immediately preceding*
    /// `Op::ConstString` op (i.e. still the top-of-stack at the start
    /// of the next `lower_op` call). Cleared at the start of every
    /// `lower_op` and re-populated by the `Op::ConstString` arm at
    /// its tail. The `Op::Call` arm reads this when `fn_index ==
    /// STDLIB_IDX_CONTAINS` to detect the const-needle case and
    /// inline a tight byte-scan loop, skipping the
    /// `relon_llvm_str_contains_arena` extern shim's FFI boundary
    /// (~10-15 cycles of prologue/epilogue per call on x86_64). On
    /// the W4 / W4_long hot loops the needle is always a
    /// compile-time const (`"x"`), so the const-needle fast path
    /// fires 100% of iters. Stays `None` when the needle came in via
    /// `LocalGet` / `LetGet` / any non-`ConstString` producer — those
    /// fall through to the existing extern path.
    pub(crate) last_const_string: Option<Vec<u8>>,
    /// Phase F.W7 self-recursion fast path: per-lambda map of captures
    /// struct offsets that hold a self-recursive closure handle, keyed
    /// by the `fn_table_idx` of the enclosing lambda. Populated only
    /// for lambda bodies (the entry / helpers leave it empty); the
    /// scanner in `build_self_capture_table` correlates each
    /// `Op::MakeClosure` in the entry with the immediately following
    /// `LetSet { idx, ty: Closure }` to identify captures whose
    /// `cap.let_idx == idx` (i.e. the binding being assigned right
    /// after MakeClosure — the canonical IR shape for a self-recursive
    /// closure-as-value let). The value `Vec<(offset,
    /// self_fn_table_idx)>` lets the lambda-prologue `Op::LocalGet(0);
    /// Op::LoadI32AtAbsolute { offset }` chain stamp the matching
    /// [`Provenance::OwnCaptureHandle`] on the produced handle so the
    /// downstream `Op::CallClosure` can pick the direct-call fast path
    /// (skip handle deref, skip switch, reuse the lambda's own
    /// captures_ptr LLVM param 1). Empty when the lambda has no
    /// self-recursive captures or when self-recursion detection is
    /// unavailable (legacy / fixture entries that bypass the
    /// MakeClosure → LetSet pattern).
    pub(crate) self_capture_offsets: Vec<(u32, u32)>,
    /// Phase F.W7 self-recursion fast path: let-slot indices that hold
    /// a self-recursive closure handle along with the enclosing
    /// lambda's `fn_table_idx`. Populated by `Op::LetSet` when the
    /// stored value carries [`Provenance::OwnCaptureHandle`] so the
    /// matching `Op::LetGet` can re-emit the provenance — this is what
    /// lets the recursive `fib(k - 1)` call site (which always goes
    /// through `LetGet`) keep the self-recursion fast path intact.
    pub(crate) self_capture_let_slots: std::collections::HashMap<u32, (u32, u32)>,
    /// Phase F.W7 self-recursion fast path: captures_ptr LLVM param
    /// (param 1) of the enclosing lambda. Cached so the closure-call
    /// emitter can pass it straight into the recursive call without
    /// re-loading from the closure handle. `None` when emitting the
    /// entry / a helper (not a lambda body) — the self-recursion fast
    /// path is gated on this being `Some`.
    pub(crate) captures_ptr_param: Option<IntValue<'ctx>>,
    /// Phase D.2 fast-path entry: let-slot indices holding a
    /// virtualised closure stamped by an in-body `Op::MakeClosure`
    /// (carries `Provenance::FastPathClosure`). The `LetSet` that
    /// catches such a value stashes the `fn_table_idx` here so the
    /// matching `LetGet` can re-emit the provenance, keeping the
    /// `CallClosure` direct-call rewrite alive across the let chain.
    /// Empty when not emitting the fast-path entry.
    pub(crate) fast_path_closure_let_slots: std::collections::HashMap<u32, u32>,
    /// Phase L W3: let-slot indices holding a `Provenance::ConstString`
    /// value (i.e. the let was set from a value sourced — directly or
    /// via prior `LetGet` chains — from an `Op::ConstString`). The
    /// matching `LetGet` re-stamps the provenance so the downstream
    /// `Op::Add(String)` lowering can switch to the const-len /
    /// single-byte-store fast path. Each entry records (len, optional
    /// first_byte). Empty by default; entries survive only across
    /// inner-loop iterations because the W3 reduce shape's `s` let is
    /// re-set every iteration from the same const literal.
    pub(crate) const_string_let_slots: std::collections::HashMap<u32, (u32, Option<u8>)>,
    /// Devirtualisation (W18): let-slot indices holding a real
    /// arena-resident closure handle whose `fn_table_idx` is a
    /// compile-time constant (`Provenance::KnownClosure`). The `LetSet`
    /// that catches such a value stashes the `fn_table_idx` here so the
    /// matching `LetGet` re-stamps the provenance, letting the downstream
    /// `CallClosure` emit a direct call (LLVM inlines it) instead of the
    /// runtime `switch i32 %cc_fn_idx`. A non-known-closure `LetSet`
    /// against the same slot wipes the entry so a later `LetGet` cannot
    /// fraudulently claim a static target. Empty by default.
    pub(crate) known_closure_let_slots: std::collections::HashMap<u32, u32>,
    /// Devirtualisation (W18): `(capture_offset, captured_fn_table_idx)`
    /// pairs for the lambda body currently being emitted, identifying
    /// captures-struct offsets that hold a handle produced by a literal
    /// `MakeClosure` with a compile-time-constant `fn_table_idx` (a
    /// *known* closure that is NOT a self-capture). The capture-load
    /// prologue (`LocalGet(0); LoadI32AtAbsolute { offset }`) stamps
    /// [`Provenance::KnownClosure`] on the matching load so a body
    /// `CallClosure` against the capture emits a direct call. Seeded by
    /// [`build_known_capture_table`]; empty when emitting the entry /
    /// helpers or a lambda with no such captures.
    pub(crate) known_capture_offsets: Vec<(u32, u32)>,
    /// Phase 0b native dispatch: the module's `#native` imports, in
    /// `import_idx` order. `Op::CallNative` validates the call's
    /// `import_idx` / param-shape / ret-ty against this table before
    /// emitting the dispatch (mirrors cranelift's `self.ir.imports`
    /// check). Empty for hand-built fixtures / fast / helper / lambda
    /// emits — those never carry a `CallNative`, so the validation arm
    /// surfaces a precise `Codegen` error if one slips through.
    pub(crate) imports: &'cp [relon_ir::ir::NativeImport],
    /// Phase 0b native dispatch: the declared `relon_llvm_call_native`
    /// helper `FunctionValue`. `Op::CallNative` emits a `call` against
    /// it. `None` outside the buffer-protocol entry (the only shape
    /// that carries a `*state` pointer to thread through). The buffer
    /// entry itself also leaves it `None` for the wasm32 open-world
    /// target and for closed-world emits, which dispatch without the
    /// helper.
    pub(crate) call_native_fn: Option<FunctionValue<'ctx>>,
    /// Stage 1.B: open-world (dynamic helper) vs closed-world (static
    /// direct `call @<host_symbol>`) native dispatch. Defaults to
    /// [`WorldMode::OpenWorld`] so MCJIT / `from_source` are untouched;
    /// only `crate::cocompile` flips it to `ClosedWorld`.
    pub(crate) world_mode: WorldMode,
    /// P3 §2.2: the codegen target. Defaults to
    /// [`CodegenTarget::Native`]; only the wasm32 object-emit path flips
    /// it to [`CodegenTarget::Wasm32`]. On wasm32 an open-world
    /// `Op::CallNative` lowers to a **wasm import** call (see
    /// [`crate::wasi_host`]) instead of the native MCJIT
    /// `relon_llvm_call_native` helper, which the sandbox cannot reach.
    pub(crate) target: crate::CodegenTarget,
    /// P3 §2.2 wasm closed-world routing: per-`import_idx` effectful flag.
    /// `effectful_imports[i] == true` means the host fn at import index
    /// `i` is capability-gated (a preceding `Op::CheckCap` guards its
    /// call) — an *effectful* fn that must cross the sandbox boundary as a
    /// **WASI import**, not be inlined into the wasm unit. `false` (or an
    /// out-of-range index on the legacy / native paths) means pure-compute:
    /// co-compile + inline. Empty slice on every path except wasm32
    /// closed-world; the wasm closed-world emit
    /// (`emit_module_funcs_closed_world_wasm`) populates it from the IR's
    /// CheckCap → CallNative shape.
    pub(crate) effectful_imports: &'cp [bool],
}
2195
/// Phase E.1: per-call inline-frame state. One entry per active
/// stdlib `Op::Call`; the callee body lowers against the topmost
/// frame.
///
/// While a frame is active, `Op::LocalGet` resolves against
/// [`Self::params`] instead of the enclosing function's LLVM params.
pub(crate) struct InlineFrame<'ctx> {
    /// LLVM values bound to the callee's `LocalGet(0..arity)` reads.
    /// Order matches the IR's declared parameter order — the
    /// `Op::Call` site popped them from the caller's operand stack
    /// (top-of-stack = last param) and reversed.
    pub(crate) params: Vec<TypedValue<'ctx>>,
    /// Offset added to the callee's `LetGet/LetSet` indices so its
    /// let-bindings don't alias the caller's slots. Mirrors the
    /// cranelift backend's `let_offset`.
    pub(crate) let_offset: u32,
    /// Alloca that receives the inlined call's result. The callee's
    /// `Op::Return` stores the popped value here and unconditionally
    /// branches to [`Self::exit_bb`]; the caller continues from there
    /// with a matching load.
    pub(crate) ret_slot: PointerValue<'ctx>,
    /// IR-level type of the value stored at [`Self::ret_slot`].
    /// Pre-computed from the IR-declared `ret_ty` of the stdlib call
    /// so the caller-side load knows what width to read.
    pub(crate) ret_ty: IrType,
    /// Branch target for `Op::Return` inside the callee body. The
    /// caller positions the builder here after the inline finishes
    /// and pushes the loaded return value back onto the operand
    /// stack.
    pub(crate) exit_bb: inkwell::basic_block::BasicBlock<'ctx>,
}
2224
/// Phase D.1 fast-path emission state. Carried inside [`Emit`] when
/// lowering the typed fast entry.
#[derive(Clone)]
pub(crate) struct FastEmit<'ctx> {
    /// Profile describing the fast entry; its `ret_offset` identifies
    /// the trailing `StoreField` that is redirected into
    /// [`Self::ret_slot`].
    pub(crate) profile: FastPathProfile,
    /// Alloca holding the i64 return value. Trailing `StoreField`
    /// at `profile.ret_offset` writes into this slot; `Op::Return`
    /// loads from it.
    pub(crate) ret_slot: PointerValue<'ctx>,
}
2235
/// One entry of the emitter's virtual operand stack: an LLVM integer
/// value plus the IR-level metadata recorded when it was pushed.
///
/// Every stack value is integer-typed on the LLVM side — `Bool` is
/// pushed as an i32 and `F64` as its i64 bit pattern.
#[derive(Clone, Copy)]
pub(crate) struct TypedValue<'ctx> {
    /// The LLVM value itself.
    pub(crate) val: IntValue<'ctx>,
    /// IR-level tag of `val`. Recorded so predicates that inspect
    /// operand types (signed-vs-unsigned cmp, F64 routing) have it on
    /// hand without re-deriving from LLVM bit width. It is read back
    /// when an inline-frame `LocalGet` re-pushes a bound argument.
    ///
    /// NOTE(review): the `#[allow(dead_code)]` below dates from when
    /// nothing consumed this field; it looks removable now — confirm
    /// the lint stays clean before dropping it.
    #[allow(dead_code)]
    pub(crate) ty: IrType,
    /// Provenance hint describing where `val` came from. Defaults to
    /// [`Provenance::None`] for every plain [`Emit::push`]; the
    /// closure tags let [`Emit::emit_call_closure`] pick a direct-call
    /// path, and [`Provenance::ConstString`] carries a compile-time
    /// string length for `Op::Add(String)`.
    pub(crate) prov: Provenance,
}
2255
/// Tracks where an [`IntValue`] on the operand stack came from so the
/// closure-call emitter can detect self-recursion without re-loading
/// the handle's captures pointer through arena indirection.
///
/// The W7 production source's `fib` closure captures itself, so every
/// recursive `fib(k - 1)` call site walks
/// `captures_ptr -> self_handle -> captures_ptr_field -> direct call`.
/// LLVM cannot fold the `captures_ptr_field` load back to the input
/// `captures_ptr` because the chain crosses `MakeClosure` in another
/// function (no IPA reach), so a pure post-O3 IR ends up with three
/// arena loads per recursion (`~10 ns/call ≈ +170 µs` over `fib(22)`).
///
/// The provenance bits below are enough to short-circuit:
///
/// * `OwnCapturesPtr` — the value is the lambda's own captures_ptr arg
///   (LLVM param 1). Produced by `Op::LocalGet(0)` inside a lambda.
/// * `OwnCaptureHandle { offset, self_fn_table_idx }` — the value is a
///   closure handle loaded from `captures_ptr + offset` and the
///   matching `MakeClosure` capture is self-recursive (handle points
///   back at the enclosing lambda whose `fn_table_idx ==
///   self_fn_table_idx`). Lets `Op::CallClosure` emit a direct call to
///   `closure_fn_table[self_fn_table_idx]` with the current
///   `captures_ptr` arg — no handle deref, no switch, no trap branch.
///
/// The remaining variants (`FastPathClosure`, `KnownClosure`,
/// `ConstString`) carry other compile-time facts and are documented on
/// the variants themselves.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum Provenance {
    /// Nothing is known about the value's origin; this is what a plain
    /// `push` records.
    None,
    /// LLVM param 1 of the enclosing lambda — the captures_ptr arg.
    OwnCapturesPtr,
    /// Closure handle loaded from `captures_ptr + offset`; the matching
    /// MakeClosure capture is self-recursive, so the handle's
    /// `captures_ptr` field equals `OwnCapturesPtr` and the handle's
    /// `fn_table_idx` equals `self_fn_table_idx`.
    OwnCaptureHandle {
        #[allow(dead_code)]
        offset: u32,
        self_fn_table_idx: u32,
    },
    /// Phase D.2: closure handle materialised by a `MakeClosure` op
    /// inside the fast-path entry. The fast entry has no arena/state,
    /// so `MakeClosure` cannot bump-allocate the 8-byte handle record;
    /// instead the value is virtualised — we remember the
    /// `fn_table_idx` and rewrite the matching `CallClosure` into a
    /// direct call against the lambda function. The lambda's
    /// `(state, captures_ptr, args...)` signature is satisfied by
    /// passing null / zero for state / captures, which is sound for
    /// W7-style self-recursive closures whose post-O3 body drops
    /// both args.
    FastPathClosure {
        fn_table_idx: u32,
    },
    /// Devirtualisation (W18, 2026-05-30): the IntValue is a *real*
    /// arena-resident closure handle (`[fn_table_idx][captures_ptr]`)
    /// produced by a literal [`Op::MakeClosure`] whose `fn_table_idx` is
    /// a compile-time constant. Unlike [`Self::FastPathClosure`] the
    /// handle is fully materialised in the arena (the buffer-protocol
    /// entry has state + arena), so the matching `CallClosure` still
    /// loads the real `captures_ptr` from `handle + 4` — it only skips
    /// the runtime `switch i32 %cc_fn_idx` over `handle + 0`, because the
    /// handle's `fn_table_idx` word is *provably* this constant.
    ///
    /// Soundness: the value flows unmodified from the `MakeClosure` (or a
    /// `LetSet`/`LetGet` round-trip, or an inline-frame argument bind)
    /// to the `CallClosure`; there is exactly one possible callee, so the
    /// switch's runtime selection is statically decided. The slow-path
    /// `build_switch` stays for any handle that did *not* arrive with
    /// this provenance (a genuinely-dynamic dispatch). When the W18
    /// `_list_filter` predicate (a literal `(k) => is_prime(k, 2)`
    /// MakeClosure) is inlined into the bundled `list_int_filter` body,
    /// this lets the per-element predicate dispatch become a direct call
    /// LLVM then inlines, killing the hot-loop switch.
    KnownClosure {
        fn_table_idx: u32,
    },
    /// Phase L W3 (2026-05-28): the IntValue is an i32 arena offset to a
    /// `[len:u32 LE][payload]` String record whose payload was placed in
    /// the const-pool prefix at module build time, so its length is
    /// known at compile time. Carried by `Op::ConstString` and
    /// propagated through `Op::LetSet { ty: String }` →
    /// `Op::LetGet { ty: String }` so `Op::Add(String)` can feed the
    /// const length to LLVM (memcpy intrinsic with const size lowers
    /// to inline stores) and skip the per-iter `[len]` header reload.
    ///
    /// Single-byte payloads (the W3 reduce hot loop's `"a"`) further
    /// expose `first_byte` so the in-place fast path can emit a single
    /// `i8 store` instead of `memcpy` — bypassing the LLVM lowering
    /// pass altogether for the dominant reduce shape.
    ConstString {
        /// Payload length in bytes, known at compile time.
        len: u32,
        /// `Some(byte)` when `len == 1` so the lowering can emit an
        /// inline `store i8 byte, dst` instead of a memcpy intrinsic.
        /// `None` for longer payloads (LLVM's memcpy intrinsic
        /// lowering still handles those well once the size is const).
        first_byte: Option<u8>,
    },
}
2351
/// Kind of structured-control-flow label, which decides where a `Br`
/// targeting the label lands.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum LabelKind {
    /// `Br` jumps **past** the block (forward exit).
    Block,
    /// `Br` jumps **back** to the loop header (continue).
    Loop,
}
2359
/// One entry of the emitter's label stack: the basic blocks a `Br` /
/// `BrIf` can target for an enclosing `Block` or `Loop`.
#[derive(Clone, Copy)]
pub(crate) struct LabelFrame<'ctx> {
    /// Header basic block. For Block this is unused for branching
    /// (we never branch backward to the start of a block); for Loop
    /// it's the target of a `Br` (continue).
    pub(crate) header_bb: inkwell::basic_block::BasicBlock<'ctx>,
    /// Tail basic block — what code after the block / after the
    /// loop falls through to. For Block this is the `Br` target;
    /// for Loop the surrounding code lives here.
    pub(crate) tail_bb: inkwell::basic_block::BasicBlock<'ctx>,
    /// Whether a branch to this label goes to `tail_bb` (Block) or
    /// `header_bb` (Loop).
    pub(crate) kind: LabelKind,
}
2372
2373impl<'ctx, 'b, 'cp> Emit<'ctx, 'b, 'cp> {
2374 #[allow(clippy::too_many_arguments)]
2375 pub(crate) fn new(
2376 ctx: &'ctx Context,
2377 builder: &'b Builder<'ctx>,
2378 module: &'b LlvmModule<'ctx>,
2379 func: FunctionValue<'ctx>,
2380 shape: EntryShape,
2381 arena_base_ptr: Option<PointerValue<'ctx>>,
2382 state_ptr: Option<PointerValue<'ctx>>,
2383 buffer_return_size: u32,
2384 const_pool: &'cp ConstPool,
2385 ) -> Self {
2386 Self {
2387 ctx,
2388 builder,
2389 func,
2390 module,
2391 shape,
2392 arena_base_ptr,
2393 state_ptr,
2394 stack: Vec::with_capacity(8),
2395 let_slots: std::collections::HashMap::new(),
2396 let_floor: 0,
2397 param_base: 0,
2398 label_stack: Vec::new(),
2399 name_seq: 0,
2400 buffer_return_size,
2401 fast_path: None,
2402 helper_table: None,
2403 helper_ret_ty: None,
2404 llvm_trap_fn: None,
2405 const_pool,
2406 inline_frames: Vec::new(),
2407 needs_tail_cursor: false,
2408 inplace_return_root: None,
2409 last_const_string: None,
2410 closure_fn_table: Vec::new(),
2411 record_locals: std::collections::HashMap::new(),
2412 self_capture_offsets: Vec::new(),
2413 self_capture_let_slots: std::collections::HashMap::new(),
2414 captures_ptr_param: None,
2415 fast_path_closure_let_slots: std::collections::HashMap::new(),
2416 const_string_let_slots: std::collections::HashMap::new(),
2417 known_closure_let_slots: std::collections::HashMap::new(),
2418 known_capture_offsets: Vec::new(),
2419 imports: &[],
2420 call_native_fn: None,
2421 world_mode: WorldMode::OpenWorld,
2422 target: crate::CodegenTarget::Native,
2423 effectful_imports: &[],
2424 }
2425 }
2426
2427 pub(crate) fn next_name(&mut self, hint: &str) -> String {
2428 self.name_seq += 1;
2429 format!("{hint}_{}", self.name_seq)
2430 }
2431
2432 // -- stack helpers --------------------------------------------------
2433
2434 pub(crate) fn push(&mut self, v: IntValue<'ctx>, ty: IrType) {
2435 self.stack.push(TypedValue {
2436 val: v,
2437 ty,
2438 prov: Provenance::None,
2439 });
2440 }
2441
2442 /// Push a value while attaching a [`Provenance`] tag. Currently
2443 /// only emitted by the lambda-prologue capture path
2444 /// (`LocalGet(0)` → `LoadI32AtAbsolute` → `LetSet/LetGet`) so
2445 /// `emit_call_closure` can short-circuit self-recursive calls.
2446 pub(crate) fn push_with_prov(&mut self, v: IntValue<'ctx>, ty: IrType, prov: Provenance) {
2447 self.stack.push(TypedValue { val: v, ty, prov });
2448 }
2449
2450 /// Phase F.W7 self-recursion fast path: peek the operand stack's
2451 /// top-of-stack provenance without consuming it and return the
2452 /// matching [`Provenance::OwnCaptureHandle`] when the top is the
2453 /// lambda's captures_ptr and `offset` matches a recorded self-
2454 /// recursive capture offset. Returns `None` otherwise — the
2455 /// caller then leaves the produced value's provenance at
2456 /// [`Provenance::None`] and the closure-call emitter falls back
2457 /// to the slow-path switch dispatch.
2458 ///
2459 /// Caller uses this **after** `emit_load_at_absolute` pops the
2460 /// base; we read the stack top here before that pop runs, so
2461 /// the lookup remains correct (the base is still on top when
2462 /// the dispatcher arm fires).
2463 pub(crate) fn peek_self_capture_provenance(&self, offset: u32) -> Option<Provenance> {
2464 let top = self.stack.last()?;
2465 if !matches!(top.prov, Provenance::OwnCapturesPtr) {
2466 return None;
2467 }
2468 // Self-recursive capture wins (its `captures_ptr`-reuse direct
2469 // path is strictly cheaper than re-loading the handle's
2470 // captures_ptr field).
2471 for (cap_offset, self_fn_table_idx) in &self.self_capture_offsets {
2472 if *cap_offset == offset {
2473 return Some(Provenance::OwnCaptureHandle {
2474 offset,
2475 self_fn_table_idx: *self_fn_table_idx,
2476 });
2477 }
2478 }
2479 // Devirtualisation (W18): a capture of a known (non-self)
2480 // closure. Stamp `KnownClosure` so the body's `CallClosure`
2481 // against the capture emits a direct call (still loading the
2482 // capture's own captures_ptr) instead of the runtime switch.
2483 for (cap_offset, captured_fn_table_idx) in &self.known_capture_offsets {
2484 if *cap_offset == offset {
2485 return Some(Provenance::KnownClosure {
2486 fn_table_idx: *captured_fn_table_idx,
2487 });
2488 }
2489 }
2490 None
2491 }
2492
2493 pub(crate) fn pop(&mut self, ip_hint: &str) -> Result<TypedValue<'ctx>, LlvmError> {
2494 self.stack.pop().ok_or_else(|| {
2495 LlvmError::Codegen(format!(
2496 "operand stack underflow at {ip_hint}: producer emitted an Op with no matching push"
2497 ))
2498 })
2499 }
2500
2501 pub(crate) fn pop_int(&mut self, ip_hint: &str) -> Result<IntValue<'ctx>, LlvmError> {
2502 self.pop(ip_hint).map(|tv| tv.val)
2503 }
2504
2505 // -- locals / lets --------------------------------------------------
2506
2507 pub(crate) fn lookup_param(&self, idx: u32) -> Result<IntValue<'ctx>, LlvmError> {
2508 let llvm_idx = self
2509 .param_base
2510 .checked_add(idx)
2511 .ok_or_else(|| LlvmError::Codegen(format!("LocalGet({idx}): param idx overflow")))?;
2512 let p = self.func.get_nth_param(llvm_idx).ok_or_else(|| {
2513 LlvmError::Codegen(format!(
2514 "LocalGet({idx}) -> llvm param #{llvm_idx} out of range; function has {} param(s)",
2515 self.func.count_params()
2516 ))
2517 })?;
2518 match p {
2519 BasicValueEnum::IntValue(v) => Ok(v),
2520 other => Err(LlvmError::Codegen(format!(
2521 "LocalGet({idx}) llvm param #{llvm_idx} is {other:?}, expected IntValue"
2522 ))),
2523 }
2524 }
2525
2526 pub(crate) fn ensure_let_slot(
2527 &mut self,
2528 idx: u32,
2529 ty: IrType,
2530 ) -> Result<PointerValue<'ctx>, LlvmError> {
2531 if let Some((ptr, existing_ty)) = self.let_slots.get(&idx) {
2532 if *existing_ty != ty {
2533 return Err(LlvmError::Codegen(format!(
2534 "let-slot {idx} aliased: previous type {existing_ty:?}, new type {ty:?}"
2535 )));
2536 }
2537 return Ok(*ptr);
2538 }
2539 // Allocate in the function's entry block so the alloca is
2540 // hoisted out of any loop body. inkwell's `build_alloca`
2541 // emits at the current position, so we temporarily reposition.
2542 let entry_bb = self.func.get_first_basic_block().ok_or_else(|| {
2543 LlvmError::Codegen("ensure_let_slot: function has no entry block".into())
2544 })?;
2545 let cur = self.builder.get_insert_block();
2546 // Position at the start of the entry block so allocas group
2547 // at the top — LLVM mem2reg requires this canonical layout
2548 // to promote slots into SSA.
2549 if let Some(first_instr) = entry_bb.get_first_instruction() {
2550 self.builder.position_before(&first_instr);
2551 } else {
2552 self.builder.position_at_end(entry_bb);
2553 }
2554 let llvm_ty: inkwell::types::BasicTypeEnum<'ctx> = match ty {
2555 // AOT-1: F64 rides as i64 bits on the virtual stack, so its
2556 // let-slot is the same 64-bit-wide integer alloca as I64.
2557 // The `(idx, ty)` aliasing key keeps an I64 and an F64 slot
2558 // for the same index distinct, so the bit pattern never gets
2559 // reinterpreted across types.
2560 IrType::I64 | IrType::F64 => self.ctx.i64_type().into(),
2561 // Phase E.1: String / List* arena offsets ride on an i32
2562 // slot — matches the cranelift backend's pointer-as-i32
2563 // wire representation.
2564 //
2565 // Phase F.W7: `Closure` joins the i32-wide variants
2566 // (closure handle is an arena-relative i32 pointer at
2567 // the IR / cranelift / LLVM boundary alike).
2568 IrType::I32
2569 | IrType::Bool
2570 | IrType::Unit
2571 | IrType::String
2572 | IrType::ListInt
2573 | IrType::ListFloat
2574 | IrType::ListBool
2575 | IrType::ListString
2576 | IrType::ListSchema
2577 | IrType::ListList
2578 | IrType::Closure
2579 | IrType::Dict => self.ctx.i32_type().into(),
2580 };
2581 let name = format!("let_{idx}");
2582 let ptr = self
2583 .builder
2584 .build_alloca(llvm_ty, &name)
2585 .map_err(|e| LlvmError::Codegen(format!("let-slot {idx} alloca: {e}")))?;
2586 if let Some(bb) = cur {
2587 self.builder.position_at_end(bb);
2588 }
2589 self.let_slots.insert(idx, (ptr, ty));
2590 Ok(ptr)
2591 }
2592
2593 // -- entry point ----------------------------------------------------
2594
2595 pub(crate) fn lower_body(&mut self, body: &[TaggedOp]) -> Result<(), LlvmError> {
2596 for (ip, tagged) in body.iter().enumerate() {
2597 self.lower_op(ip, tagged)?;
2598 }
2599 // After `Op::Return` we positioned at a fresh "after_return_cont"
2600 // block which is dead and unterminated. Seal it with
2601 // `unreachable` so LLVM's verifier accepts the module. Same
2602 // pattern applies to the post-`Br` continuation block.
2603 if let Some(cur) = self.builder.get_insert_block() {
2604 if cur.get_terminator().is_none() {
2605 self.builder
2606 .build_unreachable()
2607 .map_err(|e| LlvmError::Codegen(format!("trailing unreachable: {e}")))?;
2608 }
2609 }
2610 Ok(())
2611 }
2612
2613 // -- per-op lowering ------------------------------------------------
2614
2615 pub(crate) fn lower_op(&mut self, ip: usize, tagged: &TaggedOp) -> Result<(), LlvmError> {
2616 let ip_hint = format!("ip={ip} op={:?}", tagged.op);
2617 // Phase H const-needle fast path: capture (and clear) the
2618 // `Op::ConstString` peek-state at the very start of every
2619 // `lower_op` dispatch. The `Op::Call` arm consults `prev_const_string`
2620 // to decide between the inline byte-scan and the extern shim.
2621 // Every other arm leaves `self.last_const_string` at `None` —
2622 // the only re-populator is the `Op::ConstString` arm at its
2623 // tail. Result: `prev_const_string.is_some()` iff the prior
2624 // emitted op was `Op::ConstString` and its value is still the
2625 // top-of-stack (no intervening op consumed it).
2626 let prev_const_string = self.last_const_string.take();
2627 match &tagged.op {
2628 // ---- literals ----
2629 Op::ConstI64(v) => {
2630 let c = self.ctx.i64_type().const_int(*v as u64, true);
2631 self.push(c, IrType::I64);
2632 }
2633 Op::ConstI32(v) => {
2634 let c = self.ctx.i32_type().const_int(*v as u32 as u64, false);
2635 self.push(c, IrType::I32);
2636 }
2637 Op::ConstBool(b) => {
2638 // Bool occupies an i32 slot on the IR's virtual stack.
2639 let c = self.ctx.i32_type().const_int(u64::from(*b), false);
2640 self.push(c, IrType::Bool);
2641 }
2642 Op::ConstF64(v) => {
2643 // AOT-1: materialise the `double` literal then bit-cast
2644 // to i64 so the operand stack stays integer-typed
2645 // (Option B). `v` is an `OrderedFloat<f64>`.
2646 let f = self.ctx.f64_type().const_float(v.into_inner());
2647 let bits = self
2648 .builder
2649 .build_bit_cast(f, self.ctx.i64_type(), &self.next_name("constf64_bits"))
2650 .map_err(|e| LlvmError::Codegen(format!("ConstF64 bitcast: {e}")))?
2651 .into_int_value();
2652 self.push(bits, IrType::F64);
2653 }
2654
2655 // ---- locals / lets ----
2656 Op::LocalGet(idx) => {
2657 // Phase E.1: an active inline frame redirects
2658 // `LocalGet(i)` to the inlined call's `i`-th argument
2659 // instead of the entry-function's LLVM params.
2660 if let Some(frame) = self.inline_frames.last() {
2661 let i = *idx as usize;
2662 let tv = frame.params.get(i).ok_or_else(|| {
2663 LlvmError::Codegen(format!(
2664 "inline LocalGet({idx}) out of range — callee has {} params",
2665 frame.params.len()
2666 ))
2667 })?;
2668 // Preserve provenance across the inline-frame argument
2669 // bind. The bundled `list_int_filter` body reads its
2670 // closure parameter via `LocalGet(1)`; when the caller
2671 // passed a literal `MakeClosure` (a `KnownClosure`
2672 // handle), forwarding that provenance lets the body's
2673 // per-element `CallClosure` devirtualise into a direct
2674 // call. Only `KnownClosure` is propagated here — the
2675 // self-recursion / fast-path-entry tags depend on the
2676 // current function's `captures_ptr_param` / fast-path
2677 // state, which a *callee* inline frame does not share,
2678 // so forwarding those would be unsound.
2679 let (val, prov) = (tv.val, tv.prov);
2680 match prov {
2681 Provenance::KnownClosure { .. } => {
2682 self.push_with_prov(val, tv.ty, prov);
2683 }
2684 _ => self.push(val, tv.ty),
2685 }
2686 } else {
2687 let p = self.lookup_param(*idx)?;
2688 // The legacy envelope walks all-i64; the buffer envelope
2689 // walks (i32 ×4, i64). The IR has the right type on
2690 // the param descriptor, but we don't carry it through
2691 // LocalGet — re-derive from the LLVM param width.
2692 let width = p.get_type().get_bit_width();
2693 let ty = if width == 32 {
2694 IrType::I32
2695 } else {
2696 IrType::I64
2697 };
2698 // Phase F.W7 self-recursion fast path: tag
2699 // `LocalGet(0)` inside a lambda body with
2700 // [`Provenance::OwnCapturesPtr`] so the prologue
2701 // capture-load chain can stamp
2702 // [`Provenance::OwnCaptureHandle`] on self-
2703 // recursive handles. Only fires inside a lambda
2704 // (param_base == 1 means the LLVM param 0 is
2705 // `*state` and param 1 is the captures_ptr arg);
2706 // the entry / helpers leave provenance at
2707 // `None`.
2708 if *idx == 0 && self.captures_ptr_param.is_some() {
2709 self.push_with_prov(p, ty, Provenance::OwnCapturesPtr);
2710 } else {
2711 self.push(p, ty);
2712 }
2713 }
2714 }
2715 Op::LetSet { idx, ty } => {
2716 let v = self.pop(&ip_hint)?;
2717 let mapped = self.remap_let_idx(*idx);
2718 let slot = self.ensure_let_slot(mapped, *ty)?;
2719 // Coerce on bool / null where the producer pushed an i32
2720 // slot but the let-slot was declared as the canonical
2721 // 32-bit width.
2722 let stored = self.coerce_to_let_ty(v, *ty)?;
2723 self.builder
2724 .build_store(slot, stored)
2725 .map_err(|e| LlvmError::Codegen(format!("LetSet store: {e}")))?;
2726 // Phase F.W7 self-recursion fast path: when storing a
2727 // closure handle whose provenance points back at the
2728 // enclosing lambda, remember the let-slot so a later
2729 // `LetGet` resurrects the same provenance. This is
2730 // what bridges the prologue's capture-load chain
2731 // (`LocalGet(0); LoadI32AtAbsolute { offset }; LetSet
2732 // { idx, Closure }`) and the recursive call site
2733 // (`LetGet { idx, Closure }; ...; CallClosure`).
2734 if let Provenance::OwnCaptureHandle {
2735 offset,
2736 self_fn_table_idx,
2737 } = v.prov
2738 {
2739 if matches!(*ty, IrType::Closure) {
2740 self.self_capture_let_slots
2741 .insert(mapped, (offset, self_fn_table_idx));
2742 }
2743 }
2744 // Phase D.2 fast-path entry: when storing a virtualised
2745 // closure produced by an in-body `MakeClosure` (no
2746 // arena/state available), remember the `fn_table_idx`
2747 // so the matching `LetGet` re-emits the provenance and
2748 // the downstream `CallClosure` can rewrite into a
2749 // direct call.
2750 if let Provenance::FastPathClosure { fn_table_idx } = v.prov {
2751 if matches!(*ty, IrType::Closure) {
2752 self.fast_path_closure_let_slots
2753 .insert(mapped, fn_table_idx);
2754 }
2755 }
2756 // Devirtualisation (W18): propagate `KnownClosure`
2757 // across the `LetSet` → `LetGet` chain so a closure
2758 // handle stored into a let then read back at a
2759 // `CallClosure` site keeps its compile-time
2760 // `fn_table_idx`. A `LetSet { Closure }` of any *other*
2761 // provenance overwrites the slot with a value we cannot
2762 // prove is the same single closure, so drop the entry —
2763 // a later `LetGet` then falls back to the runtime
2764 // switch. This invalidation is what keeps a slot that is
2765 // reassigned to a dynamically-chosen closure correct.
2766 match (v.prov, *ty) {
2767 (Provenance::KnownClosure { fn_table_idx }, IrType::Closure) => {
2768 self.known_closure_let_slots.insert(mapped, fn_table_idx);
2769 }
2770 (_, IrType::Closure) => {
2771 self.known_closure_let_slots.remove(&mapped);
2772 }
2773 _ => {}
2774 }
2775 // Phase L W3: propagate `Provenance::ConstString`
2776 // across the `LetSet` → `LetGet` chain so the reduce
2777 // closure's `s` (set every iteration from the same
2778 // const literal "a" in the W3 source) can be picked
2779 // up by `Op::Add(String)` as a const-len operand.
2780 // Any non-const-string `LetSet` against the same idx
2781 // wipes the entry below.
2782 match (v.prov, *ty) {
2783 (Provenance::ConstString { len, first_byte }, IrType::String) => {
2784 self.const_string_let_slots
2785 .insert(mapped, (len, first_byte));
2786 }
2787 (_, IrType::String) => {
2788 // A non-const value just overwrote the slot —
2789 // drop any stale const-string record so a
2790 // later `LetGet` cannot fraudulently claim
2791 // const-len status.
2792 self.const_string_let_slots.remove(&mapped);
2793 }
2794 _ => {}
2795 }
2796 }
2797 Op::LetGet { idx, ty } => {
2798 // Phase E.1: remap the callee's let-idx against the
2799 // active inline frame so concurrent stdlib inlines
2800 // don't clash on slot numbers.
2801 let mapped = self.remap_let_idx(*idx);
2802 let slot = self.ensure_let_slot(mapped, *ty)?;
2803 let llvm_ty: inkwell::types::BasicTypeEnum<'ctx> = match *ty {
2804 // AOT-1: F64 rides as i64 bits, so its let-slot loads
2805 // back as an i64 (the raw bit pattern, reinterpreted
2806 // as `double` only at the arithmetic / store site).
2807 IrType::I64 | IrType::F64 => self.ctx.i64_type().into(),
2808 IrType::I32
2809 | IrType::Bool
2810 | IrType::Unit
2811 | IrType::String
2812 | IrType::ListInt
2813 | IrType::ListFloat
2814 | IrType::ListBool
2815 | IrType::ListString
2816 | IrType::ListSchema
2817 | IrType::ListList
2818 | IrType::Closure
2819 | IrType::Dict => self.ctx.i32_type().into(),
2820 };
2821 let name = self.next_name("letget");
2822 let v = self
2823 .builder
2824 .build_load(llvm_ty, slot, &name)
2825 .map_err(|e| LlvmError::Codegen(format!("LetGet load: {e}")))?
2826 .into_int_value();
2827 // Phase F.W7 self-recursion fast path: when the let-slot
2828 // was populated by the lambda prologue's self-capture
2829 // load chain, re-stamp the matching
2830 // [`Provenance::OwnCaptureHandle`] so the recursive
2831 // call site (which reads the closure handle via
2832 // `LetGet`) keeps the fast-path tag alive.
2833 if matches!(*ty, IrType::Closure) {
2834 if let Some(&(offset, self_fn_table_idx)) =
2835 self.self_capture_let_slots.get(&mapped)
2836 {
2837 self.push_with_prov(
2838 v,
2839 *ty,
2840 Provenance::OwnCaptureHandle {
2841 offset,
2842 self_fn_table_idx,
2843 },
2844 );
2845 } else if let Some(&fn_table_idx) =
2846 self.fast_path_closure_let_slots.get(&mapped)
2847 {
2848 // Phase D.2 fast-path entry: re-stamp the
2849 // virtualised-closure tag so the matching
2850 // `CallClosure` keeps the direct-call rewrite
2851 // available.
2852 self.push_with_prov(v, *ty, Provenance::FastPathClosure { fn_table_idx });
2853 } else if let Some(&fn_table_idx) = self.known_closure_let_slots.get(&mapped) {
2854 // Devirtualisation (W18): re-stamp `KnownClosure`
2855 // so a `CallClosure` reading this handle through
2856 // the let chain emits a direct call (still
2857 // loading the real captures_ptr) instead of the
2858 // runtime switch.
2859 self.push_with_prov(v, *ty, Provenance::KnownClosure { fn_table_idx });
2860 } else {
2861 self.push(v, *ty);
2862 }
2863 } else if matches!(*ty, IrType::String) {
2864 // Phase L W3: re-stamp `Provenance::ConstString`
2865 // when the let-slot is known to hold a value
2866 // sourced from `Op::ConstString`. Crucial for the
2867 // reduce closure's `s` operand — the iter-body
2868 // sets `s` from a const literal then `LetGet`s it
2869 // into the `Op::Add(String)` rhs, so without
2870 // propagation the const-len fast path can never
2871 // fire across the let chain.
2872 if let Some(&(len, first_byte)) = self.const_string_let_slots.get(&mapped) {
2873 self.push_with_prov(v, *ty, Provenance::ConstString { len, first_byte });
2874 } else {
2875 self.push(v, *ty);
2876 }
2877 } else {
2878 self.push(v, *ty);
2879 }
2880 }
2881
2882 // ---- arithmetic ----
2883 Op::Add(ty) => match ty {
2884 // Phase E.1: `Op::Add(IrType::String)` is the
2885 // pair-wise String + String form (the StrConcatN
2886 // fold only fires for compile-time-known chains —
2887 // `reduce("", (acc, s) => acc + s)` lowers to a
2888 // per-iter `Add(String)`).
2889 //
2890 // Phase I (W3 string-concat gap close): emit the
2891 // in-place-append fast path. The W3 reduce hot loop
2892 // walks `acc = acc + "a"` for N iters; under the
2893 // historical inlined-`concat` body that turned into
2894 // an O(N²) byte-copy storm because every iter
2895 // reallocated a fresh scratch record. The new
2896 // helper recognises the "lhs is the most recent
2897 // scratch alloc" case at runtime and extends the
2898 // record in place — total work drops to O(N) bytes,
2899 // matching `String::push_str`. The slow path stays
2900 // bit-identical with the historical lowering so
2901 // mixed-source string adds (const-pool literals,
2902 // out-of-order scratch records) still produce a
2903 // fresh record.
2904 IrType::String => self.emit_str_add_inplace_or_concat(&ip_hint)?,
2905 _ => self.emit_binop(&ip_hint, *ty, BinOp::Add)?,
2906 },
2907 Op::Sub(ty) => self.emit_binop(&ip_hint, *ty, BinOp::Sub)?,
2908 Op::Mul(ty) => self.emit_binop(&ip_hint, *ty, BinOp::Mul)?,
2909 Op::Div(ty) => self.emit_binop(&ip_hint, *ty, BinOp::Div)?,
2910 Op::Mod(ty) => self.emit_binop(&ip_hint, *ty, BinOp::Mod)?,
2911 Op::BitAnd(ty) => self.emit_binop(&ip_hint, *ty, BinOp::BitAnd)?,
2912 Op::ConvertI64ToF64 => self.emit_convert_i64_to_f64(&ip_hint)?,
2913 Op::F64ToI64Sat => self.emit_f64_to_i64_sat(&ip_hint)?,
2914 Op::F64Unary(op) => self.emit_f64_unary(&ip_hint, *op)?,
2915 Op::F64Pow => self.emit_f64_pow(&ip_hint)?,
2916
2917 // ---- comparisons ----
2918 Op::Eq(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::EQ)?,
2919 Op::Ne(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::NE)?,
2920 Op::Lt(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::SLT)?,
2921 Op::Le(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::SLE)?,
2922 Op::Gt(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::SGT)?,
2923 Op::Ge(ty) => self.emit_cmp(&ip_hint, *ty, IntPredicate::SGE)?,
2924
2925 // ---- buffer-protocol I/O ----
2926 Op::LoadField { offset, ty } => self.emit_load_field(*offset, *ty)?,
2927 Op::StoreField {
2928 offset,
2929 ty,
2930 inplace,
2931 } => self.emit_store_field(&ip_hint, *offset, *ty, *inplace)?,
2932
2933 // ---- pointer-indirect param loads (Phase 2 relon-rs surface) ----
2934 // String / List* `#main` parameters arrive in the input
2935 // buffer as a 4-byte buffer-relative offset to a tail
2936 // record. The IR's lowering pass emits `Op::LoadStringPtr`
2937 // (and its List* siblings) instead of `Op::LoadField {
2938 // ty: String }` so the dispatch stays unambiguous; we
2939 // share the same `emit_load_pointer_indirect_param` impl
2940 // for all variants.
2941 Op::LoadStringPtr { offset } => {
2942 self.emit_load_pointer_indirect_param(*offset, IrType::String)?
2943 }
2944 Op::LoadListIntPtr { offset } => {
2945 self.emit_load_pointer_indirect_param(*offset, IrType::ListInt)?
2946 }
2947 Op::LoadListFloatPtr { offset } => {
2948 self.emit_load_pointer_indirect_param(*offset, IrType::ListFloat)?
2949 }
2950 Op::LoadListBoolPtr { offset } => {
2951 self.emit_load_pointer_indirect_param(*offset, IrType::ListBool)?
2952 }
2953 Op::LoadListStringPtr { offset } => {
2954 self.emit_load_pointer_indirect_param(*offset, IrType::ListString)?
2955 }
2956 Op::LoadListSchemaPtr { offset } => {
2957 self.emit_load_pointer_indirect_param(*offset, IrType::ListSchema)?
2958 }
2959 Op::LoadListListPtr { offset } => {
2960 self.emit_load_pointer_indirect_param(*offset, IrType::ListList)?
2961 }
2962
2963 // ---- ReadStringLen (Phase 2 — backs `length(s)` / `len(xs)`) ----
2964 // Pop arena-relative i32 record pointer, load the leading
2965 // 4-byte length prefix, zext to i64 and push. Used by the
2966 // bundled stdlib `length` (String) / `list_*_length` bodies
2967 // — every list record shares the `[len: u32 LE]` prefix
2968 // with String, so a single lowering covers both.
2969 Op::ReadStringLen => self.emit_read_string_len(&ip_hint)?,
2970
2971 // ---- control flow ----
2972 Op::Block { result_ty, body } => self.emit_block(*result_ty, body)?,
2973 Op::Loop { result_ty, body } => self.emit_loop(*result_ty, body)?,
2974 Op::Br { label_depth } => self.emit_br(*label_depth)?,
2975 Op::BrIf { label_depth } => self.emit_br_if(&ip_hint, *label_depth)?,
2976 Op::If {
2977 result_ty,
2978 then_body,
2979 else_body,
2980 } => self.emit_if(&ip_hint, *result_ty, then_body, else_body)?,
2981
2982 // ---- return ----
2983 Op::Return => self.emit_return(&ip_hint)?,
2984
2985 // ---- Phase E.1: const-data pool ----
2986 Op::ConstString { idx, value } => {
2987 let off = self
2988 .const_pool
2989 .string_offsets
2990 .get(idx)
2991 .copied()
2992 .ok_or_else(|| {
2993 LlvmError::Codegen(format!(
2994 "Op::ConstString {{ idx: {idx} }}: missing const-pool entry — \
2995 did the host forget to lay out the pool blob before dispatch?"
2996 ))
2997 })?;
2998 let c = self.ctx.i32_type().const_int(u64::from(off), false);
2999 // Phase L W3: stamp const-len provenance so the
3000 // downstream `Op::Add(String)` lowering (via
3001 // `emit_str_add_inplace_or_concat`) can use the
3002 // compile-time-known length to elide the per-iter
3003 // `[len]` header reload and replace the rhs memcpy
3004 // with a single byte store when the literal is one
3005 // byte (the dominant cmp_lua W3 reduce shape). The
3006 // provenance only survives across `LetSet`/`LetGet`
3007 // for `IrType::String` (tracked in
3008 // `const_string_let_slots`) so non-String consumers
3009 // never observe it.
3010 let bytes = value.as_bytes();
3011 let len_u32 = u32::try_from(bytes.len()).map_err(|_| {
3012 LlvmError::Codegen("ConstString length exceeds u32 range".into())
3013 })?;
3014 let first_byte = if bytes.len() == 1 {
3015 Some(bytes[0])
3016 } else {
3017 None
3018 };
3019 self.push_with_prov(
3020 c,
3021 IrType::String,
3022 Provenance::ConstString {
3023 len: len_u32,
3024 first_byte,
3025 },
3026 );
3027 // Phase H peek-state: record the literal bytes so the
3028 // next `lower_op` call can detect `Op::Call(contains)`
3029 // with this string still at top-of-stack and switch
3030 // to the inline byte-scan instead of the extern shim.
3031 // Cleared at the start of every `lower_op` — see the
3032 // `prev_const_string.take()` line at the dispatch
3033 // head — so a single intervening op (Push / Pop /
3034 // Add / ...) drops the optimisation cleanly.
3035 self.last_const_string = Some(bytes.to_vec());
3036 }
3037
3038 // ---- Phase E.1: raw-memory primitives ----
3039 Op::LoadI32AtAbsolute { offset } => {
3040 // Phase F.W7 self-recursion fast path: when the base
3041 // (top-of-stack at this point) is the lambda's own
3042 // captures_ptr arg and the offset matches a recorded
3043 // self-recursive capture slot, the result is a
3044 // closure handle whose backing struct points back at
3045 // the enclosing lambda. Stash the provenance hint
3046 // so the downstream `LetSet/LetGet/CallClosure` chain
3047 // can short-circuit the indirect dispatch. The
3048 // sniff peeks at the stack-top without mutating it;
3049 // the actual load still flows through
3050 // `emit_load_at_absolute` so we don't fork the
3051 // raw-memory primitive's lowering.
3052 let prov_hint = self.peek_self_capture_provenance(*offset);
3053 self.emit_load_at_absolute(&ip_hint, *offset, AbsLoad::I32)?;
3054 if let Some(prov) = prov_hint {
3055 if let Some(top) = self.stack.last_mut() {
3056 top.prov = prov;
3057 }
3058 }
3059 }
3060 Op::LoadI64AtAbsolute { offset } => {
3061 self.emit_load_at_absolute(&ip_hint, *offset, AbsLoad::I64)?
3062 }
3063 Op::LoadI8UAtAbsolute { offset } => {
3064 self.emit_load_at_absolute(&ip_hint, *offset, AbsLoad::I8U)?
3065 }
3066 Op::LoadF64AtAbsolute { offset } => {
3067 self.emit_load_at_absolute(&ip_hint, *offset, AbsLoad::F64)?
3068 }
3069 Op::StoreI32AtAbsolute { offset } => {
3070 self.emit_store_at_absolute(&ip_hint, *offset, AbsStore::I32)?
3071 }
3072 Op::StoreI64AtAbsolute { offset } => {
3073 self.emit_store_at_absolute(&ip_hint, *offset, AbsStore::I64)?
3074 }
3075 Op::StoreI8AtAbsolute { offset } => {
3076 self.emit_store_at_absolute(&ip_hint, *offset, AbsStore::I8)?
3077 }
3078 Op::StoreF64AtAbsolute { offset } => {
3079 self.emit_store_at_absolute(&ip_hint, *offset, AbsStore::F64)?
3080 }
3081 Op::MemcpyAtAbsolute => self.emit_memcpy_at_absolute(&ip_hint)?,
3082 Op::AllocScratch { size_bytes } => self.emit_alloc_scratch_static(*size_bytes)?,
3083 Op::AllocScratchDyn => self.emit_alloc_scratch_dyn(&ip_hint)?,
3084 Op::StrConcatN { operand_count } => self.emit_str_concat_n(&ip_hint, *operand_count)?,
3085 Op::IntToStr => self.emit_int_to_str(&ip_hint)?,
3086 Op::FloatToStr => self.emit_float_to_str(&ip_hint)?,
3087
3088 // ---- Phase E.1 + E.2 call dispatch ----
3089 // stdlib indices (#278) route through the bundled-body
3090 // inline path (`emit_call_stdlib`); user-defined indices
3091 // (#279) resolve through the helper table populated by
3092 // `emit_module_funcs`.
3093 Op::Call {
3094 fn_index,
3095 arg_count,
3096 param_tys,
3097 ret_ty,
3098 } => {
3099 let stdlib_count = relon_ir::stdlib::stdlib_function_count();
3100 // Phase F.1: `contains(haystack, needle) -> Bool` short-
3101 // circuit. The bundled stdlib body is a hand-transcribed
3102 // O(s_len * p_len) byte scan that defeats LLVM's auto-
3103 // vectoriser on the inner compare loop (every iter
3104 // reloads the needle bytes through a let-slot). On the
3105 // W4 / W4_long cmp_lua rows that turns into a 3.4× /
3106 // 256× gap vs LuaJIT (which uses SIMD-accelerated
3107 // `string.find`). Route the call through the host shim
3108 // `relon_llvm_str_contains_arena` which defers to
3109 // `core::str::contains` — std's substring search backs
3110 // single-byte needles with SIMD `memchr` and uses a
3111 // Two-Way matcher for longer needles, closing the gap
3112 // without inventing a Relon-specific SIMD path.
3113 if *fn_index < stdlib_count
3114 && relon_ir::stdlib::stdlib_function_index("contains") == Some(*fn_index)
3115 && *arg_count == 2
3116 && param_tys == &[IrType::String, IrType::String]
3117 && *ret_ty == IrType::Bool
3118 {
3119 // Phase H: when the needle was pushed by the
3120 // immediately-preceding `Op::ConstString` (peek
3121 // state populated at `lower_op` head), inline a
3122 // tight byte-scan against the literal bytes.
3123 // Skips the `relon_llvm_str_contains_arena` FFI
3124 // boundary entirely — ~10-15 cycles of prologue /
3125 // epilogue / IC atomic loads per call. The W4 /
3126 // W4_long hot loops always hit this path (needle
3127 // = `"x"` literal); dynamic-needle callers (e.g.
3128 // `filter((s) => s.contains(other))` where
3129 // `other` flows in via an outer let-slot) fall
3130 // through to the existing Phase G extern shim.
3131 if let Some(needle_bytes) = prev_const_string.as_deref() {
3132 self.emit_str_contains_const_needle(&ip_hint, needle_bytes)?;
3133 } else {
3134 self.emit_str_contains_extern(&ip_hint)?;
3135 }
3136 } else if *fn_index < stdlib_count {
3137 self.emit_call_stdlib(&ip_hint, *fn_index, *arg_count, param_tys, *ret_ty)?
3138 } else {
3139 self.emit_call(&ip_hint, *fn_index, *arg_count, param_tys, *ret_ty)?
3140 }
3141 }
3142
3143 // ---- Phase F.W7: anon-Dict-return record ops ----
3144 // The IR lowering pass uses `AllocRootRecord` to bind a
3145 // per-record-local i32 alloca to `0` (the root sits at
3146 // `out_ptr + 0`); subsequent `StoreFieldAtRecord` ops use
3147 // the alloca-resident offset to compute the destination
3148 // address in the output buffer's fixed area.
3149 Op::AllocRootRecord { record_local_idx } => {
3150 self.emit_alloc_root_record(*record_local_idx)?
3151 }
3152 Op::StoreFieldAtRecord {
3153 record_local_idx,
3154 offset,
3155 ty,
3156 } => self.emit_store_field_at_record(&ip_hint, *record_local_idx, *offset, *ty)?,
3157
3158 // ---- Phase F.W7: closure-as-value primitives ----
3159 Op::MakeClosure {
3160 fn_table_idx,
3161 captures,
3162 captures_size,
3163 } => self.emit_make_closure(&ip_hint, *fn_table_idx, captures, *captures_size)?,
3164 Op::CallClosure { param_tys, ret_ty } => {
3165 self.emit_call_closure(&ip_hint, param_tys, *ret_ty)?
3166 }
3167
3168 // ---- Phase 0b family seams ----
3169 // The ops below are not yet lowered by the LLVM AOT backend.
3170 // They are listed EXPLICITLY (no `_ =>` wildcard) so that
3171 // adding a new `Op` variant fails to compile here — forcing a
3172 // deliberate decision instead of a silent runtime codegen
3173 // error. Each group delegates to a thin per-family entry
3174 // point living in that family's `codegen/<family>.rs` file,
3175 // so Phase 0b agents fill one family file each WITHOUT
3176 // touching this shared dispatch (zero merge conflicts). The
3177 // stubs return the same `LlvmError::Codegen` the catch-all
3178 // used to, so today's fallback behaviour is unchanged.
3179
3180 // collections.rs — list/dict/sub-record construction
3181 Op::ConstListInt { .. }
3182 | Op::ConstListFloat { .. }
3183 | Op::ConstListBool { .. }
3184 | Op::ConstListString { .. }
3185 | Op::ConstDict { .. }
3186 | Op::DictGetByStringKey { .. }
3187 | Op::ListGetByIntIdx { .. }
3188 | Op::AllocSubRecord { .. }
3189 | Op::AllocScratchRecord { .. }
3190 | Op::PushRecordBase { .. }
3191 | Op::PushRecordBaseAbsolute { .. }
3192 | Op::StoreFieldAtRecordAbsolute { .. }
3193 | Op::EmitTailRecordFromAbsoluteAddr { .. }
3194 | Op::BuildVariantRecord { .. }
3195 | Op::BuildVariantRecordScratch { .. }
3196 | Op::BuildPointerList { .. } => {
3197 self.lower_collections_rest(ip, &ip_hint, &tagged.op)?
3198 }
3199
3200 // control.rs — multi-way / select control flow
3201 Op::Select { .. } | Op::BrTable { .. } => {
3202 self.lower_control_rest(ip, &ip_hint, &tagged.op)?
3203 }
3204
3205 // mem.rs — absolute-addressed field load
3206 Op::LoadFieldAtAbsolute { .. } => self.lower_mem_rest(ip, &ip_hint, &tagged.op)?,
3207
3208 // call.rs — native dispatch + capability gate + trap
3209 Op::CallNative { .. } | Op::CheckCap { .. } | Op::Trap { .. } => {
3210 self.lower_call_rest(ip, &ip_hint, &tagged.op)?
3211 }
3212
3213 // schema.rs — schema pointer / method dispatch
3214 Op::LoadSchemaPtr { .. } => self.lower_schema_rest(ip, &ip_hint, &tagged.op)?,
3215
3216 // unicode.rs — *TableAddr long tail
3217 Op::CaseFoldTableAddr { .. }
3218 | Op::CombiningMarkRangesAddr
3219 | Op::WhitespaceRangesAddr
3220 | Op::DecompTableAddr { .. }
3221 | Op::CccTableAddr
3222 | Op::CompositionTableAddr
3223 | Op::FullCaseFoldTableAddr { .. }
3224 | Op::CasedRangesAddr
3225 | Op::CaseIgnorableRangesAddr
3226 | Op::TurkishCaseFoldTableAddr { .. } => {
3227 self.lower_unicode_rest(ip, &ip_hint, &tagged.op)?
3228 }
3229 }
3230 Ok(())
3231 }
3232
3233 // -- Phase E.1: inline-call frame helpers --------------------------
3234
3235 /// Translate a callee `LetGet/LetSet` index against the topmost
3236 /// inline frame. Mirrors cranelift's `remap_let_idx`.
3237 pub(crate) fn remap_let_idx(&self, idx: u32) -> u32 {
3238 match self.inline_frames.last() {
3239 Some(frame) => frame.let_offset.saturating_add(idx),
3240 None => idx,
3241 }
3242 }
3243
3244 // -- helpers --------------------------------------------------------
3245
3246 pub(crate) fn coerce_to_let_ty(
3247 &self,
3248 tv: TypedValue<'ctx>,
3249 target: IrType,
3250 ) -> Result<BasicValueEnum<'ctx>, LlvmError> {
3251 let want_width = match target {
3252 // AOT-1: F64 rides as i64 bits, so its let-slot is 64-wide
3253 // (same as I64). Coercion stays a width match — never an
3254 // int<->float cast — because the stack value is the raw
3255 // bit pattern, not a `double`.
3256 IrType::I64 | IrType::F64 => 64,
3257 IrType::I32
3258 | IrType::Bool
3259 | IrType::Unit
3260 | IrType::String
3261 | IrType::ListInt
3262 | IrType::ListFloat
3263 | IrType::ListBool
3264 | IrType::ListString
3265 | IrType::ListSchema
3266 | IrType::ListList
3267 | IrType::Closure
3268 | IrType::Dict => 32,
3269 };
3270 let have_width = tv.val.get_type().get_bit_width();
3271 if have_width == want_width {
3272 return Ok(tv.val.into());
3273 }
3274 let target_ty = if want_width == 64 {
3275 self.ctx.i64_type()
3276 } else {
3277 self.ctx.i32_type()
3278 };
3279 if have_width < want_width {
3280 self.builder
3281 .build_int_z_extend(tv.val, target_ty, "let_zext")
3282 .map(|v| v.as_basic_value_enum())
3283 .map_err(|e| LlvmError::Codegen(format!("let zext: {e}")))
3284 } else {
3285 self.builder
3286 .build_int_truncate(tv.val, target_ty, "let_trunc")
3287 .map(|v| v.as_basic_value_enum())
3288 .map_err(|e| LlvmError::Codegen(format!("let trunc: {e}")))
3289 }
3290 }
3291
3292 // -- control flow ---------------------------------------------------
3293}
3294
/// NOTE(review): this `impl` block is empty. Its previous doc comment
/// described an inline lookup table used by `emit_load_field` (picking the
/// LLVM integer type plus the IR tag pushed back onto the operand stack for
/// a Phase-B-supported scalar field type), but no such item is defined in
/// this block. Presumably the helper moved elsewhere — confirm, then either
/// restore it here or delete this empty impl.
impl<'ctx, 'b, 'cp> Emit<'ctx, 'b, 'cp> {}
3299
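// ---------------------------------------------------------------------------
// Shared `IrType` -> LLVM integer type mapping.
// NOTE(review): this banner used to read "Phase E.1: raw-memory primitives,
// scratch allocator, StrConcatN.", but the impl block below contains only
// `ir_ty_to_llvm_int`.
// ---------------------------------------------------------------------------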
3303
3304impl<'ctx, 'b, 'cp> Emit<'ctx, 'b, 'cp> {
3305 /// Map an `IrType` to the LLVM int type used for the operand stack
3306 /// representation. Used by `Op::MakeClosure` capture reads and
3307 /// `Op::CallClosure` return loads.
3308 pub(crate) fn ir_ty_to_llvm_int(
3309 &self,
3310 ty: IrType,
3311 ) -> Result<inkwell::types::IntType<'ctx>, LlvmError> {
3312 match ty {
3313 IrType::I64 | IrType::F64 => Ok(self.ctx.i64_type()),
3314 IrType::I32
3315 | IrType::Bool
3316 | IrType::Unit
3317 | IrType::String
3318 | IrType::ListInt
3319 | IrType::ListFloat
3320 | IrType::ListBool
3321 | IrType::ListString
3322 | IrType::ListSchema
3323 | IrType::ListList
3324 | IrType::Closure
3325 | IrType::Dict => Ok(self.ctx.i32_type()),
3326 }
3327 }
3328}
3329
#[cfg(test)]
mod const_pool_tests {
    //! Byte-for-byte layout pins for the `ConstList*` const-pool records.
    //!
    //! The pool blob is the cross-backend arena data contract: what the
    //! LLVM `ConstPool` lays out for `ConstListInt` / `ConstListFloat` /
    //! `ConstListBool` has to be byte-identical to the output of
    //! `relon_codegen_cranelift`'s `ConstPool::visit_const_list_*` (both
    //! backends copy the same blob into the arena prefix, so a layout drift
    //! on one side silently corrupts the other's cached ET_REL). Both
    //! ConstPools are crate-private, so parity is pinned here against the
    //! documented wire layout the cranelift `visit_const_list_*` port was
    //! matched to:
    //!
    //!   * int / float: align 8, `[len: u32 LE][pad: u32 zero][i64/f64 LE]`
    //!   * bool:        align 4, `[len: u32 LE][u8 0/1 tightly packed]`
    use super::*;
    use relon_ir::ir::{Func, Op, TaggedOp};
    use relon_parser::TokenRange;

    /// Attach a default source range to a bare op.
    fn tagged(op: Op) -> TaggedOp {
        let range = TokenRange::default();
        TaggedOp { op, range }
    }

    /// Build a one-function module whose entry (`run_main`) has `body` as
    /// its op list.
    fn synth_module(body: Vec<TaggedOp>) -> IrModule {
        let entry = Func {
            name: "run_main".into(),
            params: Vec::new(),
            ret: IrType::I64,
            body,
            range: TokenRange::default(),
        };
        IrModule {
            funcs: vec![entry],
            entry_func_index: Some(0),
            imports: Vec::new(),
            closure_table: Vec::new(),
        }
    }

    #[test]
    fn const_list_int_byte_layout() {
        let module = synth_module(vec![tagged(Op::ConstListInt {
            idx: 0,
            elements: vec![10, 20, 30],
        })]);
        let pool = ConstPool::from_module(&module).unwrap();

        assert_eq!(pool.list_int_offsets.get(&0).copied(), Some(0));
        // Header: [len:u32=3][pad:4 zero].
        assert_eq!(&pool.bytes[0..4], &3u32.to_le_bytes());
        assert_eq!(&pool.bytes[4..8], &[0u8; 4]);
        // Payload: three little-endian i64 values right after the header.
        for (slot, expected) in [10i64, 20, 30].iter().enumerate() {
            let start = 8 + slot * 8;
            assert_eq!(&pool.bytes[start..start + 8], &expected.to_le_bytes());
        }
        assert_eq!(pool.bytes.len(), 32);
    }

    #[test]
    fn const_list_float_byte_layout() {
        // f64 elements travel as their u64 LE bit pattern, matching the
        // IR's `ConstListFloat { elements: Vec<u64> }` representation.
        let pos_bits = 1.5f64.to_bits();
        let neg_bits = (-2.0f64).to_bits();
        let module = synth_module(vec![tagged(Op::ConstListFloat {
            idx: 0,
            elements: vec![pos_bits, neg_bits],
        })]);
        let pool = ConstPool::from_module(&module).unwrap();

        assert_eq!(pool.list_float_offsets.get(&0).copied(), Some(0));
        assert_eq!(&pool.bytes[0..4], &2u32.to_le_bytes());
        assert_eq!(&pool.bytes[4..8], &[0u8; 4]);
        for (slot, expected) in [pos_bits, neg_bits].iter().enumerate() {
            let start = 8 + slot * 8;
            assert_eq!(&pool.bytes[start..start + 8], &expected.to_le_bytes());
        }
        assert_eq!(pool.bytes.len(), 24);
    }

    #[test]
    fn const_list_bool_byte_layout() {
        let module = synth_module(vec![tagged(Op::ConstListBool {
            idx: 0,
            elements: vec![true, false, true],
        })]);
        let pool = ConstPool::from_module(&module).unwrap();

        assert_eq!(pool.list_bool_offsets.get(&0).copied(), Some(0));
        // [len:u32=3] followed by 1,0,1 packed with no padding in between.
        assert_eq!(&pool.bytes[0..4], &3u32.to_le_bytes());
        assert_eq!(&pool.bytes[4..7], &[1u8, 0, 1]);
        assert_eq!(pool.bytes.len(), 7);
    }

    #[test]
    fn const_list_alignment_across_records() {
        // The bool record occupies 4 + 3 = 7 bytes. The int record that
        // follows must start on an 8-byte boundary so that its i64 payload
        // is 8-aligned.
        let bool_record = tagged(Op::ConstListBool {
            idx: 0,
            elements: vec![true, false, true],
        });
        let int_record = tagged(Op::ConstListInt {
            idx: 1,
            elements: vec![42],
        });
        let module = synth_module(vec![bool_record, int_record]);
        let pool = ConstPool::from_module(&module).unwrap();

        assert_eq!(pool.list_bool_offsets.get(&0).copied(), Some(0));
        // 7 bytes used → align_to(8) pads to offset 8 for the int record.
        assert_eq!(pool.list_int_offsets.get(&1).copied(), Some(8));
        assert_eq!(&pool.bytes[8..12], &1u32.to_le_bytes());
        assert_eq!(&pool.bytes[16..24], &42i64.to_le_bytes());
    }

    #[test]
    fn const_list_string_byte_layout() {
        // W5-P2 pointer-array layout for elements "a", "bb", "ccc".
        // The String records come first, each 4-aligned:
        //   off  0: [slen=1]["a"]    -> 5 bytes, pad to 8
        //   off  8: [slen=2]["bb"]   -> 6 bytes, pad to 16
        //   off 16: [slen=3]["ccc"]  -> 7 bytes, pad to 24
        // followed by the header at off 24:
        //   [len=3][off_0=0][off_1=8][off_2=16]
        let module = synth_module(vec![tagged(Op::ConstListString {
            idx: 0,
            elements: vec!["a".into(), "bb".into(), "ccc".into()],
        })]);
        let pool = ConstPool::from_module(&module).unwrap();

        // (record offset, length prefix, payload) for each String record.
        let records = [(0usize, 1u32, "a"), (8, 2, "bb"), (16, 3, "ccc")];
        for (off, slen, text) in records.iter().copied() {
            let payload = off + 4;
            assert_eq!(&pool.bytes[off..payload], &slen.to_le_bytes());
            assert_eq!(
                &pool.bytes[payload..payload + text.len()],
                text.as_bytes()
            );
        }

        // Header at offset 24: element count, then one offset per element.
        let header = pool.list_string_offsets.get(&0).copied();
        assert_eq!(header, Some(24));
        assert_eq!(&pool.bytes[24..28], &3u32.to_le_bytes());
        for (at, target) in [(28usize, 0u32), (32, 8), (36, 16)].iter().copied() {
            assert_eq!(&pool.bytes[at..at + 4], &target.to_le_bytes());
        }
        assert_eq!(pool.bytes.len(), 40);
    }

    #[test]
    fn duplicate_const_list_idx_is_noop() {
        let first = tagged(Op::ConstListInt {
            idx: 0,
            elements: vec![1, 2],
        });
        let repeat = tagged(Op::ConstListInt {
            idx: 0,
            elements: vec![1, 2],
        });
        let module = synth_module(vec![first, repeat]);
        let pool = ConstPool::from_module(&module).unwrap();
        // A single record survives: 8 header + 2*8 payload = 24.
        assert_eq!(pool.bytes.len(), 24);
    }
}
3495
#[cfg(test)]
mod devirt_tests {
    //! Soundness unit tests for the capture analysis behind W18 closure
    //! devirtualisation. They drive the IR scan that decides which captures
    //! may be stamped `KnownClosure` (→ direct call) and which must remain a
    //! genuinely-dynamic dispatch (→ runtime switch). A wrong answer here is
    //! a silent miscompile, so the analysis is pinned independently of any
    //! end-to-end source.
    use super::*;
    use relon_ir::ir::{ClosureCapture, Func, IrType, Op, TaggedOp};
    use relon_parser::TokenRange;

    /// Attach a default source range to a bare op.
    fn op(o: Op) -> TaggedOp {
        let range = TokenRange::default();
        TaggedOp { op: o, range }
    }

    /// `LetSet { idx, ty: Closure }`, already tagged.
    fn bind_closure(idx: u32) -> TaggedOp {
        op(Op::LetSet {
            idx,
            ty: IrType::Closure,
        })
    }

    /// `MakeClosure` whose `captures_size` reaches 8 bytes past the highest
    /// capture offset (0 when nothing is captured).
    fn make_closure(fn_table_idx: u32, captures: Vec<ClosureCapture>) -> Op {
        let captures_size = captures
            .iter()
            .fold(0, |end, capture| end.max(capture.offset + 8));
        Op::MakeClosure {
            fn_table_idx,
            captures,
            captures_size,
        }
    }

    /// A `Closure`-typed capture of let-slot `let_idx` stored at `offset`.
    fn cap(let_idx: u32, offset: u32) -> ClosureCapture {
        let ty = IrType::Closure;
        ClosureCapture {
            let_idx,
            ty,
            offset,
        }
    }

    /// Entry function (`run_main`, `(I32) -> I32`) wrapping `body`.
    fn entry_with_body(body: Vec<TaggedOp>) -> Func {
        let range = TokenRange::default();
        Func {
            name: "run_main".into(),
            params: vec![IrType::I32],
            ret: IrType::I32,
            body,
            range,
        }
    }

    /// Capturing a *known, non-self* closure is recorded, so the capturing
    /// lambda's body can devirtualise calls through it. Models the W18
    /// predicate `(k) => is_prime(k, 2)` capturing the `is_prime` closure
    /// (`fn_table_idx=0`).
    #[test]
    fn records_known_non_self_capture() {
        //   let0 := MakeClosure(K=0)                    ; the `is_prime` binding
        //   MakeClosure(L=1) capturing let0 at offset 0 ; the predicate
        let is_prime_self_capture = op(make_closure(0, vec![cap(0, 0)]));
        let predicate_capturing_is_prime = op(make_closure(1, vec![cap(0, 0)]));
        let consumer_call = op(Op::Call {
            fn_index: 14,
            arg_count: 2,
            param_tys: vec![IrType::ListInt, IrType::Closure],
            ret_ty: IrType::ListInt,
        });
        let entry = entry_with_body(vec![
            is_prime_self_capture,
            bind_closure(0),
            predicate_capturing_is_prime,
            consumer_call,
        ]);
        let table = build_known_capture_table(&entry, &[], &[]);

        // Lambda L=1 (the predicate) captures known closure K=0 at offset 0.
        let predicate_entry = table.get(&1).map(Vec::as_slice);
        assert_eq!(
            predicate_entry,
            Some(&[(0u32, 0u32)][..]),
            "predicate (L=1) must record its is_prime (K=0) capture as known"
        );
        // is_prime's own capture (L=0) is a SELF capture (K==L==0), so it
        // must stay out of this table: the self-capture table owns it, and
        // its captures_ptr-reuse direct path is strictly better.
        assert!(
            !table.contains_key(&0),
            "self-capture (K==L) must be excluded from the known-capture table"
        );
    }

    /// Once a closure let-slot is reassigned to something that is NOT a
    /// literal `MakeClosure` (a genuinely-dynamic closure), capturing it
    /// must NOT be recorded and the body keeps the runtime switch. This is
    /// the correctness red line: only a provably-unique callee may be
    /// devirtualised.
    #[test]
    fn drops_reassigned_dynamic_closure_slot() {
        //   let0 := MakeClosure(0)          ; known
        //   let0 := <some other Closure>    ; reassigned, now dynamic
        //   MakeClosure(2) capturing let0   ; must NOT be recorded
        let entry = entry_with_body(vec![
            op(make_closure(0, vec![cap(0, 0)])),
            bind_closure(0),
            // A `LetSet { Closure }` whose value is NOT produced by a
            // MakeClosure — it models a closure that arrived from somewhere
            // unprovable (a param, a phi, a different binding).
            op(Op::LetGet {
                idx: 5,
                ty: IrType::Closure,
            }),
            bind_closure(0),
            op(make_closure(2, vec![cap(0, 0)])),
            bind_closure(9),
        ]);
        let table = build_known_capture_table(&entry, &[], &[]);

        assert!(
            !table.contains_key(&2),
            "a capture of a reassigned (dynamic) closure slot must NOT be \
             recorded — the call must keep the runtime switch"
        );
    }

    /// The binding `LetSet` directly after a known `MakeClosure` must NOT
    /// wipe the slot it has just established (the ordering bug fixed during
    /// development); a later capture of that slot is still recorded.
    #[test]
    fn binding_letset_does_not_clear_its_own_slot() {
        let entry = entry_with_body(vec![
            op(make_closure(3, vec![])),
            bind_closure(7),
            op(make_closure(4, vec![cap(7, 0)])),
            bind_closure(8),
        ]);
        let table = build_known_capture_table(&entry, &[], &[]);

        let recorded = table.get(&4).map(Vec::as_slice);
        assert_eq!(
            recorded,
            Some(&[(0u32, 3u32)][..]),
            "L=4 must record its capture of known closure K=3 at offset 0"
        );
    }
}