relon_codegen_llvm/
evaluator.rs

1//! Runtime façade for the LLVM AOT backend.
2//!
3//! Phase B widens the evaluator past the bootstrap envelope:
4//!
5//! - [`LlvmAotEvaluator::from_ir_direct`] keeps the legacy-i64 entry
6//!   shape (`(I64...) -> I64`) for hand-built IR fixtures and the
7//!   side-by-side `from_ir_direct` benches.
8//! - [`LlvmAotEvaluator::from_source`] drives the full
9//!   parse + analyze + `lower_workspace_single` + LLVM emit + JIT
10//!   pipeline. Matches the cranelift backend's `from_source` shape
11//!   so a host can swap the two evaluators by changing the
12//!   constructor name.
13//!
14//! ## Why MCJIT (and not ORC) for Phase B
15//!
16//! - MCJIT is the simplest engine that inkwell exposes — single
17//!   `create_jit_execution_engine` call, no per-symbol resolver
18//!   plumbing. The Phase B goal is W1 / W2 production-source parity,
19//!   not throughput.
20//! - inkwell 0.9.0 wraps both engines, so switching to ORC in
21//!   Phase C is a localised diff: one call site here, the emitter
22//!   stays untouched.
23//! - LLVM 18's MCJIT still handles the W1 / W2 hot path (single
24//!   function, no global state, no external symbols).
25
26use std::cell::RefCell;
27use std::collections::HashMap;
28use std::sync::atomic::{AtomicI64, Ordering};
29use std::sync::Arc;
30
31use inkwell::context::Context;
32use inkwell::execution_engine::ExecutionEngine;
33use inkwell::passes::PassBuilderOptions;
34use inkwell::targets::{
35    CodeModel, InitializationConfig, RelocMode, Target, TargetMachine, TargetTriple,
36};
37use inkwell::OptimizationLevel;
38
39use relon_eval_api::inplace_return::ArenaRegions;
40use relon_eval_api::{ClosureData, Evaluator, RuntimeError, Scope, Thunk, Value};
41use relon_parser::Node;
42
43use crate::codegen::{
44    emit_fast_entry, emit_module_funcs, emit_module_funcs_closed_world,
45    emit_module_funcs_closed_world_wasm, emit_module_funcs_wasm, is_buffer_protocol_signature,
46    ConstPool, EntryShape, FastPathProfile, WorldMode, ENTRY_SYMBOL, ENTRY_SYMBOL_FAST,
47};
48use crate::error::LlvmError;
49use crate::state::ArenaState;
50use crate::str_helpers::RELON_LLVM_STR_CONTAINS_ARENA_SYMBOL;
51use inkwell::module::Linkage;
52use inkwell::targets::FileType;
53use inkwell::values::FunctionValue;
54use std::path::Path;
55
/// Maximum positional arity supported by the Phase A legacy-i64
/// entry. Mirrors the cranelift crate's `MAX_LEGACY_ARITY`; the four
/// slots cover every helloworld-style body in the Phase A bootstrap
/// and benchmarks.
///
/// Enforced in `from_ir_inner_world`: a non-buffer entry with more
/// params than this is rejected with
/// `LlvmError::UnsupportedSignature`.
///
/// Phase B adds the buffer-protocol path on top — that path is not
/// arity-capped because every IR arg flows through the buffer rather
/// than positional slots.
pub const MAX_LEGACY_ARITY: usize = 4;
65
/// Selects which machine the object-emit path (S3.X) generates code
/// for.
///
/// Both variants are fed by the SAME relon-IR → LLVM-IR emitter; the
/// only per-target difference is how the `TargetMachine` is built
/// (triple, DataLayout, CPU/features, reloc/code model). `mem.rs`
/// already addresses the arena through i32-offset GEPs (zext-i64 +
/// `i8*` base), so the lowered body is pointer-width agnostic and
/// needs no per-target change.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CodegenTarget {
    /// The historical default: a host x86-64 ELF object. The triple
    /// and CPU/features are taken from
    /// `TargetMachine::get_default_triple` / `get_host_cpu_*`, and the
    /// reloc model is PIC.
    Native,
    /// A `wasm32-wasi` object (starts with the `\0asm` magic). Built
    /// with the WebAssembly LLVM backend and the canonical wasm32
    /// DataLayout; the resulting object is consumed by `wasmtime`
    /// (see `crate::wasm_run`).
    Wasm32,
}
84
/// Reference copy of the wasm32 DataLayout string LLVM emits for
/// `wasm32-wasi` (little-endian, 32-bit pointers, i64 8-byte aligned).
///
/// Documentation only: the module DataLayout is set authoritatively
/// from the `TargetMachine`'s target data at emit time, so this const
/// merely records the expected shape — note the `p:32:32` that lowers
/// the i32-offset arena GEPs to 32-bit linear-memory pointers.
// `dead_code` is allowed because nothing reads the const; it exists
// purely as in-source reference.
#[allow(dead_code)]
const WASM32_DATA_LAYOUT: &str = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20";
/// Target triple used for the wasm32 object-emit path.
///
/// `wasm32-wasi` is chosen so the module can later route effectful
/// host fns through WASI imports (P3 §2.2). For pure-compute
/// workloads `wasm32-unknown-unknown` would also work; wasi is the
/// superset.
const WASM32_TRIPLE: &str = "wasm32-wasi";
98
// `extern "C"` function-pointer aliases for the legacy-i64 entry: one
// alias per positional arity from 0 through 4 (`MAX_LEGACY_ARITY`),
// every parameter and the result being an `i64`.
//
// Shorter signatures pass zero in the trailing slots — the emitter
// only declares the parameters the IR has, so unused trailing slots
// are dead-on-arrival.
//
// NOTE(review): an earlier revision of this note spoke of "five i64
// slots"; the widest alias below has four slots (five is the number
// of aliases). Confirm against the dispatch code whether callers pick
// the alias matching the entry arity or always call through the
// widest one with zero padding.
type LegacyEntryFn0 = unsafe extern "C" fn() -> i64;
type LegacyEntryFn1 = unsafe extern "C" fn(i64) -> i64;
type LegacyEntryFn2 = unsafe extern "C" fn(i64, i64) -> i64;
type LegacyEntryFn3 = unsafe extern "C" fn(i64, i64, i64) -> i64;
type LegacyEntryFn4 = unsafe extern "C" fn(i64, i64, i64, i64) -> i64;
109
/// `extern "C"` function pointer for the buffer-protocol entry. The
/// state pointer comes first to match the cranelift backend's
/// `BufferEntryFn` so the two evaluators share dispatch shape.
///
/// The four `i32` slots describe the input and output regions and the
/// trailing `i64` is the capability bitmask (see
/// `LlvmAotEvaluator::caps_mask`). The `i32` result is the entry's
/// `bytes_written`.
type BufferEntryFn = unsafe extern "C" fn(
    *const ArenaState, // per-call arena state
    i32, // in_ptr
    i32, // in_len
    i32, // out_ptr
    i32, // out_cap
    i64, // caps
) -> i32;
121
// Phase D.1 fast-path typed entries. Arity-specialised C ABI shapes
// up to 8 args — the arity cap matches `emit_fast_entry`'s envelope.
// Every parameter and the result are `i64`; unlike `BufferEntryFn`
// there is no `ArenaState` pointer in these signatures.
type FastEntryFn0 = unsafe extern "C" fn() -> i64;
type FastEntryFn1 = unsafe extern "C" fn(i64) -> i64;
type FastEntryFn2 = unsafe extern "C" fn(i64, i64) -> i64;
type FastEntryFn3 = unsafe extern "C" fn(i64, i64, i64) -> i64;
type FastEntryFn4 = unsafe extern "C" fn(i64, i64, i64, i64) -> i64;
type FastEntryFn5 = unsafe extern "C" fn(i64, i64, i64, i64, i64) -> i64;
type FastEntryFn6 = unsafe extern "C" fn(i64, i64, i64, i64, i64, i64) -> i64;
type FastEntryFn7 = unsafe extern "C" fn(i64, i64, i64, i64, i64, i64, i64) -> i64;
type FastEntryFn8 = unsafe extern "C" fn(i64, i64, i64, i64, i64, i64, i64, i64) -> i64;
133
/// Owned LLVM JIT state for a single compiled module. The
/// [`Context`] / [`ExecutionEngine`] pair must outlive every call
/// into the JITted function pointer; we park them on the heap behind
/// the evaluator so the host can ignore lifetimes.
///
/// Field order is load-bearing: Rust drops struct fields in
/// declaration order, so `_engine` (first) is torn down before `_ctx`
/// (last). Do not reorder these two fields.
struct JitOwned {
    // The `Context` must outlive the ExecutionEngine; it is kept in a
    // boxed heap slot (`_ctx` below) so its address stays stable and
    // the engine's `'static`-extended borrow stays valid for the
    // evaluator's lifetime.
    _engine: ExecutionEngine<'static>,
    /// Raw entry function pointer resolved once at construction time.
    /// Cached so the hot path is a single indirect call (matches the
    /// cranelift backend's `LegacyEntryFn` stash).
    entry_ptr: usize,
    /// Phase D.1: typed fast entry pointer resolved at construction
    /// time when the source qualifies for the dispatch-boundary fast
    /// path. `None` when the IR fails to lower against the fast
    /// envelope (string ops, sandbox traps, etc.) — `run_main` falls
    /// back to the buffer entry transparently in that case.
    fast_entry_ptr: Option<usize>,
    /// Pre-rendered textual LLVM IR. inkwell 0.9's
    /// `ExecutionEngine::get_module*` is missing, so the dump-time
    /// call cannot reach back to the live module — we pay the
    /// `print_to_string` cost up-front.
    ir_dump: String,
    /// Heap-allocated context the engine borrows from. Declared last
    /// so it is dropped after `_engine`.
    _ctx: Box<Context>,
}
160
// SAFETY: the inkwell ExecutionEngine + Context pair is not `Sync`
// by default — LLVM's `LLVMContextRef` is per-thread. We mark the
// pair Send/Sync because `run_main` only reaches back into the JIT
// through the cached function pointers (`entry_ptr`, `fast_entry_ptr`),
// which are immutable after construction; the only per-call mutable
// state is the thread-local `LLVM_ARENA_POOL`, which needs no lock.
//
// NOTE(review): `Send` also allows the owning evaluator — and thus
// the engine/context drop — to run on a thread other than the one
// that created them. This argument only covers the call path; confirm
// that cross-thread teardown is acceptable for the LLVM build in use.
unsafe impl Send for JitOwned {}
unsafe impl Sync for JitOwned {}
169
/// Buffer schema metadata captured by `from_source`. Mirrors
/// `relon_codegen_cranelift::evaluator::BufferSchema` — kept inside this
/// crate (rather than re-imported) so the LLVM backend stays
/// independent.
struct BufferSchema {
    /// Schema of the `#main` inputs as returned by the frontend; its
    /// field names, in order, become the evaluator's `param_names`.
    main_schema: relon_eval_api::schema_canonical::Schema,
    /// Schema of the value `#main` returns, as returned by the
    /// frontend.
    return_schema: relon_eval_api::schema_canonical::Schema,
    /// Offset table computed from `main_schema` via
    /// `SchemaLayout::offsets_for`.
    main_layout: relon_eval_api::layout::OffsetTable,
    /// Offset table computed from `return_schema` via
    /// `SchemaLayout::offsets_for`; its `root_size` is handed to the
    /// emitter as the buffer entry's return size.
    return_layout: relon_eval_api::layout::OffsetTable,
}
180
/// Phase B LLVM AOT evaluator. Either constructed from a pre-lowered
/// IR module via [`Self::from_ir_direct`] (legacy-i64 envelope) or
/// from a `.relon` source via [`Self::from_source`] (buffer-protocol
/// envelope).
pub struct LlvmAotEvaluator {
    /// Owned JIT state: engine, context, cached entry pointers and the
    /// pre-rendered IR dump.
    jit: JitOwned,
    /// Which entry envelope the compiled module exposes (see
    /// `EntryShape`).
    entry_shape: EntryShape,
    /// Number of declared entry arguments.
    /// NOTE(review): presumably the `declared_arity` that
    /// `from_ir_inner_world` validates `param_names` against — the
    /// assignment site is not visible here; confirm.
    entry_arity: usize,
    /// Declared argument names in positional order. For source-driven
    /// entries these are the `#main` schema field names.
    param_names: Vec<String>,
    /// Buffer schema for source-driven entries; `None` for direct-IR.
    buffer_schema: Option<BufferSchema>,
    /// Phase D.1: arity of the typed `(i64...) -> i64` fast entry the
    /// JIT module exported alongside the buffer entry. Held here so
    /// `run_main` can pick the fast pointer when the supplied args
    /// match the eligible shape. The value matches
    /// `buffer_schema.main_schema.fields.len()` when every field is
    /// `Int`.
    ///
    /// Stored as a bare `usize`, not `Option<usize>`: the single
    /// resolution site in `from_ir_inner_world` assigns it together
    /// with `fast_entry_ptr` from one tuple, so "ptr present, arity
    /// absent" is unrepresentable by construction. Only meaningful
    /// while `jit.fast_entry_ptr` is `Some` (it is `0` and never read
    /// otherwise — both readers gate on the pointer first). Keeping
    /// the per-call dispatch free of an `Option` unwrap matters: a
    /// panicking `expect` here once pushed `run_main_legacy_i64_fast`
    /// past the LTO inline-cost threshold, de-inlining it from bench
    /// loops and costing the W12 kernel 2.7x per call.
    fast_path_arity: usize,
    /// Whether the public `run_main` method may automatically choose the
    /// typed fast entry. The fast entry has no `ArenaState` / trap-code
    /// channel, so bodies that can raise typed runtime traps stay
    /// callable through `run_main_legacy_i64_fast` for benchmarks but
    /// normal host evaluation routes through the buffer entry.
    fast_path_auto_dispatch: bool,
    /// Phase E.1: const-data bytes the IR's `Op::ConstString` /
    /// `Op::ConstList*` records reference through arena-relative i32
    /// offsets. The host copies this blob into the arena prefix at
    /// every dispatch so the JIT-emitted `iconst(I32, offset)` lands
    /// on the right record.
    const_data: Vec<u8>,
    /// Phase 0b: the module's `#native` imports in `import_idx` order.
    /// Carried so [`Self::with_host_fns`] can match a host-supplied
    /// `Arc<dyn RelonFunction>` (keyed by source-level name) to the
    /// `import_idx` the lowering pass assigned.
    native_imports: Vec<relon_ir::ir::NativeImport>,
    /// Phase 0b: host-fn registry installed on every per-call
    /// `ArenaState` so a source-lowered `Op::CallNative` dispatches
    /// through `relon_llvm_call_native`. Behind an `Arc` so the
    /// registry outlives every dispatch without per-call clones; rebuilt
    /// by [`Self::with_host_fns`]. Empty by default — an unregistered
    /// gated call then traps after passing the `CheckCap` gate.
    host_fns: Arc<crate::state::HostFnRegistry>,
    /// Phase 0b: capability bitmask passed as the buffer entry's
    /// trailing `i64 caps` param. The source-lowered `Op::CheckCap`
    /// gate tests bit `cap_bit` of this word; `0` denies every gated
    /// call. Set via [`Self::with_granted_cap`] / [`Self::with_caps`].
    caps_mask: i64,
    /// Remaining step budget installed into each per-call
    /// [`ArenaState`]. `0` means unlimited. Atomic so it can be
    /// updated through a shared reference.
    step_budget: AtomicI64,
}
243
thread_local! {
    /// Per-thread arena buffer reused across `run_main_buffer` calls
    /// on the same thread. The pool caches the largest `arena_size`
    /// the thread has ever requested; subsequent dispatches reuse
    /// the allocation and only pay a targeted `fill(0)` over the
    /// observable prefix. Mirrors the cranelift backend's
    /// `ARENA_POOL` to keep the dispatch boundary cost comparable.
    ///
    /// Starts as an empty `Vec` (no allocation) via a `const`
    /// initializer. Access is guarded by a `RefCell`, so a nested
    /// mutable borrow on the same thread would panic —
    /// NOTE(review): confirm the dispatch path never re-enters while
    /// the pool is borrowed.
    static LLVM_ARENA_POOL: RefCell<Vec<u8>> = const { RefCell::new(Vec::new()) };
}
253
/// Encode an optional step limit as the signed word stored in the
/// evaluator's step budget.
///
/// - `None` becomes `0` (the budget field documents `0` as
///   "unlimited").
/// - `Some(0)` becomes `-1`, keeping an explicit zero limit distinct
///   from the unlimited encoding.
/// - `Some(n)` becomes `n`, saturating at `i64::MAX` when `n` does not
///   fit in an `i64`.
fn step_budget_to_i64(steps: Option<u64>) -> i64 {
    steps.map_or(0, |limit| {
        if limit == 0 {
            -1
        } else {
            i64::try_from(limit).unwrap_or(i64::MAX)
        }
    })
}
261
262impl LlvmAotEvaluator {
263    /// Compile a pre-lowered IR module into a JIT-resident function
264    /// pointer. Accepts either a legacy-i64 entry
265    /// (`(I64...) -> I64`) or the buffer-protocol shape
266    /// (`(I32, I32, I32, I32, I64) -> I32`); the emitter inspects the
267    /// entry signature and picks the matching wrapper.
268    ///
269    /// `param_names` parallels the cranelift backend's
270    /// `from_ir_direct` arg so the `Evaluator::run_main` dispatch
271    /// can look up positional args by their declared name. Direct-IR
272    /// callers without a schema can pass synthetic
273    /// `["arg0", "arg1", …]` names.
274    pub fn from_ir_direct(
275        ir: relon_ir::ir::Module,
276        param_names: Vec<String>,
277    ) -> Result<Self, LlvmError> {
278        Self::from_ir_inner(ir, param_names, None)
279    }
280
281    /// Drive the full `parse → analyze → lower → emit → JIT` pipeline
282    /// against a `.relon` source. Matches the cranelift backend's
283    /// `AotEvaluator::from_source` shape so hosts can swap the two
284    /// evaluators by changing the constructor.
285    ///
286    /// Phase B accepts the IR shape `lower_workspace_single` emits
287    /// for `#main` source with the W1 / W2 production envelope
288    /// (range / map / sum). Sources outside that envelope (closures
289    /// past peephole, schema-method dispatch, stdlib calls, …) fail
290    /// at the LLVM emit step with `LlvmError::Codegen`.
291    pub fn from_source(src: &str) -> Result<Self, LlvmError> {
292        Self::from_source_with_options_inner(src, None)
293    }
294
295    /// Like [`Self::from_source`] but with caller-supplied analyzer
296    /// options — the entry point for host-registered `#native` fns.
297    /// The host populates `options.host_fn_names` /
298    /// `host_fn_signatures` / `host_fn_gates` / `caps` so the analyzer
299    /// resolves the calls, runs the single-file capability-reachability
300    /// check (a gated call without the statically-granted cap fails the
301    /// build here), and the lowering pass emits the `Op::CheckCap`-
302    /// guarded `Op::CallNative`.
303    ///
304    /// The returned evaluator carries an empty host-fn registry and a
305    /// zero capability mask; chain [`Self::with_host_fns`] +
306    /// [`Self::with_granted_cap`] to wire the runtime dispatch + grant.
307    /// Mirrors the cranelift backend's `from_source_with_options`.
308    pub fn from_source_with_options(
309        src: &str,
310        options: &relon_analyzer::AnalyzeOptions,
311    ) -> Result<Self, LlvmError> {
312        Self::from_source_with_options_inner(src, Some(options))
313    }
314
315    fn from_source_with_options_inner(
316        src: &str,
317        options: Option<&relon_analyzer::AnalyzeOptions>,
318    ) -> Result<Self, LlvmError> {
319        let (ir, main_schema, return_schema) = Self::lower_source_with_options(src, options)?;
320        let main_layout = relon_eval_api::layout::SchemaLayout::offsets_for(&main_schema)
321            .map_err(|e| LlvmError::Codegen(format!("main schema layout: {e}")))?;
322        let return_layout = relon_eval_api::layout::SchemaLayout::offsets_for(&return_schema)
323            .map_err(|e| LlvmError::Codegen(format!("return schema layout: {e}")))?;
324        let param_names: Vec<String> = main_schema.fields.iter().map(|f| f.name.clone()).collect();
325        let schema = BufferSchema {
326            main_schema,
327            return_schema,
328            main_layout,
329            return_layout,
330        };
331        Self::from_ir_inner(ir, param_names, Some(schema))
332    }
333
334    fn lower_source_with_options(
335        src: &str,
336        options: Option<&relon_analyzer::AnalyzeOptions>,
337    ) -> Result<
338        (
339            relon_ir::ir::Module,
340            relon_eval_api::schema_canonical::Schema,
341            relon_eval_api::schema_canonical::Schema,
342        ),
343        LlvmError,
344    > {
345        // W7 closure-as-value (Phase F.W7): the production source
346        // `#main(Int n) -> Dict { #internal fib: (k) => ..., result: fib(n) }`
347        // trips the v1.5 / v1.6 strict-mode type-surface diagnostics
348        // (`ClosureParamTypeMissing`, `ClosureReturnTypeUnknown`,
349        // `ExpressionTypeUnknown`) even though IR lowering accepts the
350        // shape via `lower_anon_dict_body`. Run the analyzer with
351        // `strict_mode: false` so the soft bans don't gate LLVM
352        // codegen. Hard structural errors (`UnknownTypeName`,
353        // `MainReturnTypeMismatch`, etc.) still surface as `Error`-
354        // severity diagnostics under non-strict mode and still gate the
355        // build below. Unlike the Cranelift route, the LLVM backend does
356        // NOT force `standalone_capability_check`.
357        //
358        // Phase 0b: a caller-supplied `options` (host `#native` fns)
359        // takes precedence — the host already sets `strict_mode: false`
360        // on it (see the cranelift `host_options` fixture). We force
361        // `strict_mode: false` regardless so the closure surface stays
362        // unblocked even if a host left it default-true.
363        let owned;
364        let options: &relon_analyzer::AnalyzeOptions = match options {
365            Some(o) => {
366                if o.strict_mode {
367                    owned = relon_analyzer::AnalyzeOptions {
368                        strict_mode: false,
369                        ..o.clone()
370                    };
371                    &owned
372                } else {
373                    o
374                }
375            }
376            None => {
377                owned = relon_analyzer::AnalyzeOptions {
378                    strict_mode: false,
379                    ..Default::default()
380                };
381                &owned
382            }
383        };
384        // Map the shared frontend pipeline error onto this backend's
385        // surface: Parse → Parse, Analyze(n) → Analyze(n), and Lowering
386        // → Codegen with the historical `lower_workspace_single:` prefix
387        // (the LLVM backend has no dedicated `Lowering` variant).
388        let lowered = relon_ir::frontend::compile(src, options).map_err(|e| match e {
389            relon_ir::FrontendError::Parse(msg) => LlvmError::Parse(msg),
390            relon_ir::FrontendError::Analyze(n) => LlvmError::Analyze(n),
391            relon_ir::FrontendError::Lowering(msg) => {
392                LlvmError::Codegen(format!("lower_workspace_single: {msg}"))
393            }
394        })?;
395        Ok((lowered.module, lowered.main_schema, lowered.return_schema))
396    }
397
398    /// Stage 2.⑤ closed-world source constructor. Builds the
399    /// buffer-protocol JIT evaluator with `Op::CallNative` lowered to a
400    /// direct `call @<host_symbol>`, links + inlines the host shim
401    /// bitcode, and reuses the open-world arena-handshake dispatch
402    /// (`run_main`) verbatim — the entry symbol / signature are
403    /// identical, only the native-dispatch lowering differs. No host-fn
404    /// registry / cap mask is needed at runtime: the host body is folded
405    /// into the entry by the LTO inline, so there is no dynamic
406    /// `relon_llvm_call_native` hop to resolve.
407    ///
408    /// The differential oracle for this path is the open-world
409    /// `from_source_with_options` + `run_main` result (anchored, in
410    /// turn, to cranelift's `native_call_from_source`).
411    pub fn from_source_closed_world(
412        src: &str,
413        options: &relon_analyzer::AnalyzeOptions,
414        host_shim_src: &str,
415    ) -> Result<Self, LlvmError> {
416        let (ir, main_schema, return_schema) = Self::lower_source_with_options(src, Some(options))?;
417        let main_layout = relon_eval_api::layout::SchemaLayout::offsets_for(&main_schema)
418            .map_err(|e| LlvmError::Codegen(format!("main schema layout: {e}")))?;
419        let return_layout = relon_eval_api::layout::SchemaLayout::offsets_for(&return_schema)
420            .map_err(|e| LlvmError::Codegen(format!("return schema layout: {e}")))?;
421        let param_names: Vec<String> = main_schema.fields.iter().map(|f| f.name.clone()).collect();
422        let schema = BufferSchema {
423            main_schema,
424            return_schema,
425            main_layout,
426            return_layout,
427        };
428        Self::from_ir_inner_world(
429            ir,
430            param_names,
431            Some(schema),
432            WorldMode::ClosedWorld,
433            Some(host_shim_src),
434        )
435    }
436
437    fn from_ir_inner(
438        ir: relon_ir::ir::Module,
439        param_names: Vec<String>,
440        buffer_schema: Option<BufferSchema>,
441    ) -> Result<Self, LlvmError> {
442        Self::from_ir_inner_world(ir, param_names, buffer_schema, WorldMode::OpenWorld, None)
443    }
444
445    fn from_ir_inner_world(
446        ir: relon_ir::ir::Module,
447        param_names: Vec<String>,
448        buffer_schema: Option<BufferSchema>,
449        world_mode: WorldMode,
450        host_shim_src: Option<&str>,
451    ) -> Result<Self, LlvmError> {
452        let entry_idx = ir
453            .entry_func_index
454            .ok_or_else(|| LlvmError::Codegen("IR module has no entry function".into()))?;
455        let entry = &ir.funcs[entry_idx];
456
457        // Detect the shape up-front so we can validate `param_names`
458        // against the correct envelope.
459        let buffer_shape = is_buffer_protocol_signature(&entry.params, entry.ret);
460        if !buffer_shape && entry.params.len() > MAX_LEGACY_ARITY {
461            return Err(LlvmError::UnsupportedSignature(format!(
462                "llvm-aot: {} params exceeds MAX_LEGACY_ARITY={MAX_LEGACY_ARITY}",
463                entry.params.len()
464            )));
465        }
466        let declared_arity = if buffer_shape {
467            buffer_schema
468                .as_ref()
469                .map(|s| s.main_schema.fields.len())
470                .unwrap_or(0)
471        } else {
472            entry.params.len()
473        };
474        if param_names.len() != declared_arity {
475            return Err(LlvmError::UnsupportedSignature(format!(
476                "llvm-aot: param_names len {} does not match declared arity {declared_arity}",
477                param_names.len()
478            )));
479        }
480        if buffer_shape && buffer_schema.is_none() {
481            // A direct-IR caller handed in a buffer-protocol IR
482            // without schema metadata. We can still JIT-compile,
483            // but `run_main` won't be able to pack the input or
484            // decode the output. Reject up-front so the host knows.
485            return Err(LlvmError::UnsupportedSignature(
486                "llvm-aot: buffer-protocol IR requires schema metadata; use from_source".into(),
487            ));
488        }
489        if !buffer_shape && buffer_schema.is_some() {
490            return Err(LlvmError::UnsupportedSignature(
491                "llvm-aot: schema metadata supplied for non-buffer entry".into(),
492            ));
493        }
494
495        // Build the LLVM module under a per-evaluator Context. We
496        // leak the Context onto the heap and transmute the engine's
497        // lifetime to `'static` (see SAFETY note on `JitOwned`).
498        let ctx_box: Box<Context> = Box::new(Context::create());
499        // SAFETY: `ctx_box` lives on the heap and we never deallocate
500        // it before the engine.
501        let ctx_static: &'static Context = unsafe { &*(ctx_box.as_ref() as *const Context) };
502
503        let module = ctx_static.create_module("relon_llvm_aot");
504
505        // Buffer-protocol entries return `bytes_written` as i32; under
506        // the Phase B envelope this is statically the schema's
507        // `return_layout.root_size` (no pointer-indirect StoreField
508        // bumps the tail cursor). Legacy entries ignore this value.
509        let buffer_return_size = buffer_schema
510            .as_ref()
511            .map(|s| s.return_layout.root_size as u32)
512            .unwrap_or(0);
513        // Phase E.1: build the const-data pool by walking every
514        // function body in `ir`. The blob is shipped to the host
515        // alongside the JIT engine and copied to the arena prefix at
516        // every dispatch so `Op::ConstString { idx }` resolves to a
517        // stable arena-relative offset.
518        let const_pool = ConstPool::from_module(&ir)?;
519        // Phase E.2: collect every IR sibling function (non-entry,
520        // non-lambda) so the LLVM emit pass can lower them alongside
521        // the entry. The entry's `Op::Call` lowering resolves
522        // user-defined sibling calls through the returned helper
523        // table.
524        //
525        // Phase F.W7: collect the lambdas (funcs registered in
526        // `closure_table`) separately so the emit pass can apply the
527        // widened `(state, captures_ptr, ...params) -> ret` signature
528        // and seed the closure function-pointer table. The IR's
529        // `closure_table` maps a `fn_table_idx` to an `ir.funcs`
530        // index; we mirror that order so the emit pass's
531        // `closure_fn_table[fn_table_idx]` matches what `MakeClosure`
532        // references.
533        let lambda_ir_idx_set: std::collections::HashSet<u32> =
534            ir.closure_table.iter().copied().collect();
535        let helpers: Vec<&relon_ir::ir::Func> = ir
536            .funcs
537            .iter()
538            .enumerate()
539            .filter(|(i, _)| *i != entry_idx && !lambda_ir_idx_set.contains(&(*i as u32)))
540            .map(|(_, f)| f)
541            .collect();
542        let helper_ir_indices: Vec<u32> = ir
543            .funcs
544            .iter()
545            .enumerate()
546            .filter(|(i, _)| *i != entry_idx && !lambda_ir_idx_set.contains(&(*i as u32)))
547            .map(|(i, _)| i as u32)
548            .collect();
549        let lambdas: Vec<&relon_ir::ir::Func> = ir
550            .closure_table
551            .iter()
552            .map(|&ir_idx| &ir.funcs[ir_idx as usize])
553            .collect();
554        let emit = match world_mode {
555            WorldMode::OpenWorld => emit_module_funcs,
556            WorldMode::ClosedWorld => emit_module_funcs_closed_world,
557        };
558        let (_llvm_fn, entry_shape, helper_table, closure_fn_table) = emit(
559            ctx_static,
560            &module,
561            entry,
562            buffer_return_size,
563            &const_pool,
564            &helpers,
565            Some(&helper_ir_indices),
566            &lambdas,
567            &ir.closure_table,
568            &ir.imports,
569        )?;
570
571        // Stage 2.⑤ closed-world: link + inline the host shim bitcode
572        // into the JIT module so the direct `call @<host_symbol>` sites
573        // fold into the host body during the O3 pass below. Done before
574        // the fast-entry emit so a fast entry (Int-only, no native call)
575        // is unaffected; closed-world sources always take the buffer
576        // entry because they carry an `Op::CallNative`.
577        if matches!(world_mode, WorldMode::ClosedWorld) {
578            let shim = host_shim_src.ok_or_else(|| {
579                LlvmError::Codegen(
580                    "from_ir_inner_world: ClosedWorld requires a host_shim_src".into(),
581                )
582            })?;
583            crate::cocompile::link_and_inline_host_shim(&module, shim, &ir.imports)?;
584        }
585
586        // Phase D.1 / D.2: attempt to emit the typed fast-path entry
587        // alongside the buffer entry whenever the schema qualifies.
588        // Emission failure is treated as a "no fast path available"
589        // condition rather than a hard error — the IR can stay on
590        // the buffer entry, which is correct (just slower).
591        //
592        // We discover eligibility from the `buffer_schema` (declared
593        // `#main` params + return) and the IR body. Sources that
594        // touch ops outside the fast envelope (strings, sandbox
595        // traps, etc.) fail emission inside `emit_fast_entry`; we
596        // capture the error to the IR dump for post-mortem and
597        // continue with the buffer-only module.
598        //
599        // Closure modules are stateful even when their outer schema
600        // looks like a single-Int fast shape: lambda bodies receive the
601        // real `ArenaState` so they can read captures from the arena and
602        // participate in bounds/trap semantics. The typed fast entry has
603        // no state pointer, so keep it off for any closure table entry.
604        // The wasm/object path already applies this same routing rule.
605        let fast_profile = buffer_schema
606            .as_ref()
607            .filter(|_| ir.closure_table.is_empty())
608            .and_then(|s| build_fast_path_profile(s).ok());
609        let fast_path_auto_dispatch = !body_may_raise_typed_trap(&entry.body);
610        let mut fast_emit_diagnostic: Option<String> = None;
611        if let Some(profile) = fast_profile.as_ref() {
612            match emit_fast_entry(
613                ctx_static,
614                &module,
615                entry,
616                profile,
617                &helper_table,
618                &closure_fn_table,
619            ) {
620                Ok(_) => {}
621                Err(e) => {
622                    fast_emit_diagnostic = Some(format!("{e}"));
623                    // Roll back the partially-emitted fast entry so
624                    // the module verifies cleanly with just the
625                    // buffer entry. inkwell's `delete` is unsafe
626                    // because it invalidates any outstanding
627                    // `FunctionValue` handle; the emitter dropped
628                    // its handle when `emit_fast_entry` returned.
629                    if let Some(f) = module.get_function(ENTRY_SYMBOL_FAST) {
630                        unsafe { f.delete() };
631                    }
632                }
633            }
634        }
635
636        module
637            .verify()
638            .map_err(|e| LlvmError::Codegen(format!("LLVM verifier rejected module: {e}")))?;
639
640        // Pin every function to the RUNTIME host CPU before MCJIT
641        // codegen. The MCJIT engine builders take no MCPU, so without
642        // this the X86 backend lowers for generic x86-64 and drops the
643        // host `SlowDivide64` narrowing — every i64 `%` / `/` becomes a
644        // bare microcoded `idivq` instead of the host `shrq $32; je;
645        // divl` fast path. The O3 pipeline and the static object-emit
646        // path already target the host; this brings the JIT backend in
647        // line. Stamping `target-cpu` / `target-features` (host-queried,
648        // never hard-coded) is the lever inkwell 0.9 / MCJIT exposes.
649        // Results are byte-identical to the generic lowering — this is a
650        // codegen-quality / instruction-selection fix, not a semantics
651        // change.
652        stamp_host_target_attributes(&module);
653
654        // Run LLVM's `-O3` middle-end pipeline on the module before
655        // handing it to MCJIT. MCJIT's `OptimizationLevel::Aggressive`
656        // controls backend codegen optimizations (regalloc, instr
657        // selection) but does **not** invoke the IR-level passes —
658        // `mem2reg`, `instcombine`, `gvn`, `licm`, loop-unroll,
659        // SLP-vectorize, etc. live in the middle-end pipeline. Without
660        // them the emitted IR's alloca-heavy stack-machine lowering
661        // hits the assembler unsimplified, leaving a 100×+ gap vs the
662        // equivalent native Rust hot loop.
663        //
664        // The pipeline is built fresh through `PassBuilderOptions`
665        // (the LLVM 17+ new pass manager) since inkwell 0.9 deprecates
666        // the legacy `PassManager` for IR-level work on LLVM 18.
667        // Debug: capture pre-opt IR if the env requests it via
668        // `RELON_LLVM_DUMP_PREOPT=1`. The pre-opt shape is mostly
669        // alloca / load / store noise but is useful when verifying
670        // that emitter changes survived the dispatch path (post-opt
671        // IR can have aggressive constant folding that makes brand-
672        // new branches invisible). The flag is intentionally opt-in
673        // so production paths never pay the second IR dump.
674        let preopt_dump: Option<String> = std::env::var_os("RELON_LLVM_DUMP_PREOPT")
675            .map(|_| module.print_to_string().to_string());
676
677        run_default_o3_pipeline(&module)?;
678
679        // Capture the dumped IR *after* the optimizer ran so tests
680        // that assert on the IR see the post-opt shape (mem2reg /
681        // loop simplification visible). The pre-opt shape is mostly
682        // alloca / load / store noise.
683        let mut ir_dump = module.print_to_string().to_string();
684        if let Some(p) = preopt_dump {
685            ir_dump = format!("; --- PRE-OPT IR ---\n{p}\n; --- POST-OPT IR ---\n{ir_dump}");
686        }
687
688        // Phase L profile-first: dump post-O3 IR + host-targeted ASM
689        // to `$RELON_LLVM_DUMP_DIR/` when the env var is set. The dump
690        // mirrors the actual MCJIT codegen path (same TargetMachine
691        // knobs as `run_default_o3_pipeline`) so the .s file matches
692        // what the JIT engine actually emits at JIT-resolve time.
693        if let Some(dir) = std::env::var_os("RELON_LLVM_DUMP_DIR") {
694            let dir = std::path::PathBuf::from(dir);
695            let _ = std::fs::create_dir_all(&dir);
696            let _ = std::fs::write(dir.join("module.post_o3.ll"), &ir_dump);
697            // Re-create a TargetMachine matching the JIT path so the
698            // dumped ASM is byte-equivalent to what MCJIT codegen
699            // hands to the loader. The codegen-side OptLevel for MCJIT
700            // is `Aggressive` (see `create_jit_execution_engine` call
701            // below); mirror that here.
702            if let Ok(()) = Target::initialize_native(&InitializationConfig::default()) {
703                let triple_str = TargetMachine::get_default_triple();
704                if let Ok(target) = Target::from_triple(&triple_str) {
705                    let cpu = TargetMachine::get_host_cpu_name();
706                    let features = TargetMachine::get_host_cpu_features();
707                    if let Ok(triple_utf8) = triple_str.as_str().to_str() {
708                        let triple = TargetTriple::create(triple_utf8);
709                        if let Some(machine) = target.create_target_machine(
710                            &triple,
711                            cpu.to_str().unwrap_or(""),
712                            features.to_str().unwrap_or(""),
713                            OptimizationLevel::Aggressive,
714                            RelocMode::Default,
715                            CodeModel::JITDefault,
716                        ) {
717                            let _ = machine.write_to_file(
718                                &module,
719                                FileType::Assembly,
720                                &dir.join("module.s"),
721                            );
722                            let _ = machine.write_to_file(
723                                &module,
724                                FileType::Object,
725                                &dir.join("module.o"),
726                            );
727                        }
728                        // Dump variant: CodeModel::Small + RelocMode::PIC
729                        // so we can A/B with `module.s` and see whether the
730                        // recursive call shrinks to a PC-rel `callq <sym>`.
731                        if let Some(machine) = target.create_target_machine(
732                            &triple,
733                            cpu.to_str().unwrap_or(""),
734                            features.to_str().unwrap_or(""),
735                            OptimizationLevel::Aggressive,
736                            RelocMode::PIC,
737                            CodeModel::Small,
738                        ) {
739                            let _ = machine.write_to_file(
740                                &module,
741                                FileType::Assembly,
742                                &dir.join("module.small_pic.s"),
743                            );
744                        }
745                        // Dump variant: CodeModel::Small + RelocMode::Static.
746                        if let Some(machine) = target.create_target_machine(
747                            &triple,
748                            cpu.to_str().unwrap_or(""),
749                            features.to_str().unwrap_or(""),
750                            OptimizationLevel::Aggressive,
751                            RelocMode::Static,
752                            CodeModel::Small,
753                        ) {
754                            let _ = machine.write_to_file(
755                                &module,
756                                FileType::Assembly,
757                                &dir.join("module.small_static.s"),
758                            );
759                        }
760                    }
761                }
762            }
763        }
764
765        // Phase L codegen-quality: pick the MCJIT engine builder by
766        // whether the module references the host-side `contains` shim.
767        //
768        // - **No extern** -> use the custom memory manager + Small
769        //   CodeModel. All same-module calls collapse to direct
770        //   `callq <pcrel32>` instead of MCJIT's default
771        //   `movabsq + callq *%reg` (Large CodeModel). For tight
772        //   recursive bodies like W7 fib this saves ~0.2 ns / call
773        //   on Intel; multiplied by fib(22)'s ~35 k call tree it
774        //   closes ~10 µs of the gap vs the rustc LTO build.
775        //
776        // - **Extern present** -> stay on the default JIT builder
777        //   (Large CodeModel) because the host-side shim lives in
778        //   the executable's `.text` which is typically > 2 GB away
779        //   from the JIT's freshly-mmap'd code arena. A 32-bit
780        //   PC-relative relocation would fail to resolve; the Large
781        //   CodeModel's `movabsq + indirect` pattern handles it.
782        //
783        // Detection is purely structural — we look up the shim
784        // symbol on the module. The emitter declares it lazily, so
785        // its presence means "this module has at least one extern
786        // call site that needs `add_global_mapping` after engine
787        // creation".
788        // Phase 0b: the native-dispatch helper is also a host-resident
789        // extern (it lives in this crate's `.text`, not the JIT arena),
790        // so a module that references it must stay on the default JIT
791        // builder (Large CodeModel) for the same ±2 GB-relocation reason
792        // the `str.contains` shim does.
793        let uses_extern_shim = module
794            .get_function(crate::str_helpers::RELON_LLVM_STR_CONTAINS_ARENA_SYMBOL)
795            .is_some()
796            || module
797                .get_function(crate::str_helpers::RELON_LLVM_F64_TO_STR_SYMBOL)
798                .is_some()
799            || module
800                .get_function(crate::state::RELON_LLVM_CALL_NATIVE_SYMBOL)
801                .is_some();
802        let force_default_mcjit = std::env::var_os("RELON_LLVM_FORCE_DEFAULT_MCJIT").is_some();
803        let engine = if uses_extern_shim || force_default_mcjit {
804            module
805                .create_jit_execution_engine(OptimizationLevel::Aggressive)
806                .map_err(|e| LlvmError::Codegen(format!("create_jit_execution_engine: {e}")))?
807        } else {
808            let mm = crate::mcjit_mm::ContiguousCodeMemoryManager::new();
809            module
810                .create_mcjit_execution_engine_with_memory_manager(
811                    mm,
812                    OptimizationLevel::Aggressive,
813                    inkwell::targets::CodeModel::Small,
814                    /*no_frame_pointer_elim=*/ false,
815                    /*enable_fast_isel=*/ false,
816                )
817                .map_err(|e| {
818                    LlvmError::Codegen(format!(
819                        "create_mcjit_execution_engine_with_memory_manager (Small CodeModel): {e}"
820                    ))
821                })?
822        };
823
824        // Phase F.1: wire the host shim that backs the LLVM AOT
825        // `contains(haystack, needle) -> Bool` fast path. The emitter
826        // declares this symbol with `Linkage::External` whenever a
827        // module references it; MCJIT needs an explicit address
828        // mapping because the default resolver (`dlsym`) cannot see
829        // statics from inside the current dylib's strip-able section
830        // layout. We register unconditionally — if the module never
831        // referenced the symbol the mapping is a no-op.
832        if let Some(shim_fn) =
833            module.get_function(crate::str_helpers::RELON_LLVM_STR_CONTAINS_ARENA_SYMBOL)
834        {
835            engine.add_global_mapping(
836                &shim_fn,
837                crate::str_helpers::relon_llvm_str_contains_arena_addr(),
838            );
839        }
840
841        // Wave B: same constraint for the float-render shim — the
842        // `Op::FloatToStr` lowering declares `relon_llvm_f64_to_str`
843        // as an external function whose body lives in this dylib's
844        // `.text`. No-op when the module never rendered a Float.
845        if let Some(shim_fn) = module.get_function(crate::str_helpers::RELON_LLVM_F64_TO_STR_SYMBOL)
846        {
847            engine.add_global_mapping(&shim_fn, crate::str_helpers::relon_llvm_f64_to_str_addr());
848        }
849
850        // Phase 0b: map the native-dispatch helper symbol to its host
851        // address so an emitted `call @relon_llvm_call_native` resolves.
852        // The default MCJIT resolver (`dlsym`) cannot see the static
853        // from inside this dylib's section layout — same constraint as
854        // the `str.contains` shim. No-op when the module never emitted
855        // a `CallNative` (the symbol is absent).
856        if let Some(cn_fn) = module.get_function(crate::state::RELON_LLVM_CALL_NATIVE_SYMBOL) {
857            engine.add_global_mapping(&cn_fn, crate::state::relon_llvm_call_native_addr());
858        }
859
860        let entry_ptr = engine.get_function_address(ENTRY_SYMBOL).map_err(|e| {
861            LlvmError::Codegen(format!(
862                "ExecutionEngine could not resolve `{ENTRY_SYMBOL}`: {e}"
863            ))
864        })?;
865
866        // Phase D.1: resolve the typed fast-entry pointer when the
867        // module exported one. Resolution failure here is *not* an
868        // emit-side bug — the symbol simply wasn't emitted (or was
869        // rolled back) — so we treat it as "no fast path" silently.
870        //
871        // Pairing invariant: this is the *only* assignment of the
872        // `fast_entry_ptr`/`fast_path_arity` pair. Both arms set the
873        // two together, so a live pointer always carries the profile's
874        // real arity and the `(Some ptr, missing arity)` state cannot
875        // exist — the hot dispatch reads the arity without any
876        // `Option` check or panic path.
877        let (fast_entry_ptr, fast_path_arity) = match (&fast_profile, &fast_emit_diagnostic) {
878            (Some(profile), None) => match engine.get_function_address(ENTRY_SYMBOL_FAST) {
879                Ok(ptr) => (Some(ptr), profile.arg_offsets.len()),
880                Err(_) => (None, 0),
881            },
882            _ => (None, 0),
883        };
884        // Stash the fast-emit diagnostic (if any) into the IR dump so
885        // post-mortem tests can assert on it without needing a
886        // dedicated getter. The dump is only consumed by tests so the
887        // overhead doesn't matter at runtime.
888        let ir_dump = match fast_emit_diagnostic {
889            Some(diag) => format!("; fast-emit diagnostic: {diag}\n{ir_dump}"),
890            None => ir_dump,
891        };
892
893        Ok(Self {
894            jit: JitOwned {
895                _engine: engine,
896                entry_ptr,
897                fast_entry_ptr,
898                ir_dump,
899                _ctx: ctx_box,
900            },
901            entry_shape,
902            entry_arity: entry.params.len(),
903            param_names,
904            buffer_schema,
905            fast_path_arity,
906            fast_path_auto_dispatch,
907            const_data: const_pool.bytes,
908            native_imports: ir.imports.clone(),
909            host_fns: Arc::new(crate::state::HostFnRegistry::new()),
910            caps_mask: 0,
911            step_budget: AtomicI64::new(0),
912        })
913    }
914
915    /// Number of `#main` arguments expected. Under the buffer-protocol
916    /// shape this is the count of declared `#main(...)` params (from
917    /// the source schema), not the entry function's IR arity (which
918    /// is always 5 for buffer protocol). Under the legacy-i64 shape
919    /// the two coincide.
920    pub fn arity(&self) -> usize {
921        self.param_names.len()
922    }
923
924    /// Names of the declared `#main` parameters in declaration order.
925    pub fn param_names(&self) -> &[String] {
926        &self.param_names
927    }
928
929    /// Phase 0b: the `#native` imports the lowering pass interned for
930    /// this module, in `import_idx` order. Lets a host map fn names to
931    /// the slots [`Self::with_host_fns`] fills. Mirrors the cranelift
932    /// backend's `native_imports`.
933    pub fn native_imports(&self) -> &[relon_ir::ir::NativeImport] {
934        &self.native_imports
935    }
936
937    /// Phase 0b: register the host's `Arc<dyn RelonFunction>` callables
938    /// for source-lowered native-fn dispatch. Each entry is keyed by the
939    /// source-level fn name; this matches the name to the `import_idx`
940    /// the lowering pass assigned (via [`Self::native_imports`]) and
941    /// installs the callable in the evaluator's `import_idx`-keyed
942    /// registry. A source-lowered `Op::CallNative` then dispatches to it
943    /// through the `relon_llvm_call_native` helper. Names with no
944    /// matching `#native` import are skipped. Mirrors the cranelift
945    /// backend's `with_host_fns`.
946    ///
947    /// The capability *guard* is enforced independently by the
948    /// `Op::CheckCap` prologue against the granted `caps` mask
949    /// ([`Self::with_granted_cap`]) — registering a callable does not
950    /// grant its capability.
951    pub fn with_host_fns(
952        mut self,
953        host_fns: &std::collections::HashMap<String, Arc<dyn relon_eval_api::RelonFunction>>,
954    ) -> Self {
955        let mut registry = crate::state::HostFnRegistry::new();
956        for (idx, imp) in self.native_imports.iter().enumerate() {
957            if let Some(func) = host_fns.get(&imp.name) {
958                registry.register(idx as u32, Arc::clone(func));
959            }
960        }
961        self.host_fns = Arc::new(registry);
962        self
963    }
964
965    /// Phase 0b: grant a capability bit so the source-lowered
966    /// `Op::CheckCap` prologue passes at runtime. Sets bit `bit` in the
967    /// `caps` bitmask the buffer entry receives as its trailing `i64`
968    /// param. Decoupled from the analyze-time `caps`: a host can grant
969    /// statically (build passes the reachability check) yet withhold
970    /// here to exercise a stricter runtime posture (the gated call then
971    /// traps `CapabilityDenied`). Mirrors the cranelift backend's
972    /// `with_granted_cap` outcome class.
973    pub fn with_granted_cap(mut self, bit: u32) -> Self {
974        if bit < 64 {
975            self.caps_mask |= 1i64 << bit;
976        }
977        self
978    }
979
980    /// Phase 0b: set the full `caps` bitmask wholesale (the trailing
981    /// `i64` param the buffer entry's `Op::CheckCap` gate tests).
982    /// Companion to [`Self::with_granted_cap`] for hosts that already
983    /// hold a packed mask.
984    pub fn with_caps(mut self, caps_mask: i64) -> Self {
985        self.caps_mask = caps_mask;
986        self
987    }
988
989    /// Configure the LLVM buffer-entry step budget. `None` disables
990    /// the budget. `Some(n)` permits `n` entry/loop budget ticks before
991    /// the JIT records `ResourceExhausted` and the host lifts it to
992    /// `RuntimeError::StepLimitExceeded`.
993    pub fn set_step_budget(&self, steps: Option<u64>) {
994        self.step_budget
995            .store(step_budget_to_i64(steps), Ordering::Relaxed);
996    }
997
998    /// Builder-style companion to [`Self::set_step_budget`].
999    pub fn with_step_budget(self, steps: Option<u64>) -> Self {
1000        self.set_step_budget(steps);
1001        self
1002    }
1003
1004    /// Fast-path entry mirroring `AotEvaluator::run_main_legacy_i64`:
1005    /// skip the HashMap pack and invoke the JIT entry with a slice of
1006    /// positional i64 args. Only valid for the legacy-i64 entry shape.
1007    pub fn run_main_legacy_i64(&self, args: &[i64]) -> Result<i64, RuntimeError> {
1008        if self.entry_shape != EntryShape::LegacyI64 {
1009            return Err(RuntimeError::Unsupported {
1010                reason: "llvm-aot: run_main_legacy_i64 called on buffer-protocol entry".into(),
1011            });
1012        }
1013        if args.len() != self.entry_arity {
1014            return Err(RuntimeError::Unsupported {
1015                reason: format!(
1016                    "llvm-aot: #main expects {} arg(s), got {}",
1017                    self.entry_arity,
1018                    args.len()
1019                ),
1020            });
1021        }
1022        let ptr = self.jit.entry_ptr;
1023        // SAFETY: see Phase A `run_main_legacy_i64` for the same
1024        // transmute-and-call pattern. The cached `entry_ptr` was
1025        // returned by `ExecutionEngine::get_function_address` at
1026        // construction time and stays valid for the engine's
1027        // lifetime.
1028        unsafe {
1029            match self.entry_arity {
1030                0 => {
1031                    let f: LegacyEntryFn0 = std::mem::transmute(ptr);
1032                    Ok(f())
1033                }
1034                1 => {
1035                    let f: LegacyEntryFn1 = std::mem::transmute(ptr);
1036                    Ok(f(args[0]))
1037                }
1038                2 => {
1039                    let f: LegacyEntryFn2 = std::mem::transmute(ptr);
1040                    Ok(f(args[0], args[1]))
1041                }
1042                3 => {
1043                    let f: LegacyEntryFn3 = std::mem::transmute(ptr);
1044                    Ok(f(args[0], args[1], args[2]))
1045                }
1046                4 => {
1047                    let f: LegacyEntryFn4 = std::mem::transmute(ptr);
1048                    Ok(f(args[0], args[1], args[2], args[3]))
1049                }
1050                n => Err(RuntimeError::Unsupported {
1051                    reason: format!("llvm-aot: arity {n} > MAX_LEGACY_ARITY={MAX_LEGACY_ARITY}"),
1052                }),
1053            }
1054        }
1055    }
1056
1057    /// Print the emitted LLVM IR. Useful for tests / benchmarks that
1058    /// want to assert against the lowering output without leaving
1059    /// the test binary.
1060    pub fn emit_ir_dump(&self) -> &str {
1061        &self.jit.ir_dump
1062    }
1063
1064    /// Phase D.1: does this evaluator have a JIT-resident fast entry
1065    /// the host can dispatch through when args are all-Int? Exposed
1066    /// for the smoke tests that assert the fast path is wired up;
1067    /// benches use it to log which row hit the fast vs buffer path.
1068    pub fn has_fast_path(&self) -> bool {
1069        self.jit.fast_entry_ptr.is_some()
1070    }
1071
1072    /// Phase D.1: arity of the typed fast entry, when one was emitted.
1073    /// Matches `arity()` for source-driven entries that qualify; `None`
1074    /// when the source falls back to the buffer-only path.
1075    pub fn fast_path_arity(&self) -> Option<usize> {
1076        self.jit.fast_entry_ptr.map(|_| self.fast_path_arity)
1077    }
1078
1079    /// Phase L codegen-quality debug helper: raw address of the typed
1080    /// fast-entry function in the JIT-allocated code arena. Returns
1081    /// `None` if the source falls back to the buffer entry. Hosts use
1082    /// this to disassemble the MCJIT-produced machine code at runtime
1083    /// (`xxd` / `objdump --disassemble-all` on a byte slice) — useful
1084    /// for confirming whether the engine emitted direct `callq <pcrel>`
1085    /// vs the Large-CodeModel `movabsq + callq *%reg` shape.
1086    pub fn fast_entry_runtime_addr(&self) -> Option<usize> {
1087        self.jit.fast_entry_ptr
1088    }
1089
1090    /// Phase L codegen-quality debug helper: raw address of the
1091    /// buffer-protocol entry function in the JIT-allocated code arena.
1092    /// Always populated for a successful `from_source` build.
1093    pub fn entry_runtime_addr(&self) -> usize {
1094        self.jit.entry_ptr
1095    }
1096
1097    /// The running host's LLVM CPU name (e.g. `broadwell`, `znver3`),
1098    /// as queried by `TargetMachine::get_host_cpu_name`. This is the
1099    /// exact value stamped as the `"target-cpu"` function attribute on
1100    /// every JIT'd function so the MCJIT backend lowers for the CPU it
1101    /// runs on (and emits the host idiv-narrowing fast path rather than
1102    /// a generic bare `idivq`). Exposed so capability tests can confirm
1103    /// the stamp is the runtime host, never a hard-coded literal.
1104    pub fn host_target_cpu() -> String {
1105        TargetMachine::get_host_cpu_name()
1106            .to_str()
1107            .unwrap_or("")
1108            .to_string()
1109    }
1110
1111    /// Phase D.1 dispatch-boundary fast path: invoke the typed fast
1112    /// entry directly with positional `i64` args. Bypasses the
1113    /// `HashMap` pack, `BufferBuilder` writes, arena setup, and
1114    /// `BufferReader` decode entirely — the call chain is
1115    /// `Rust caller → cached fn pointer → JIT body → i64 return`.
1116    ///
1117    /// Returns `Err(Unsupported)` when the evaluator was built without
1118    /// a fast entry (source past the Int-only envelope, or
1119    /// constructed via `from_ir_direct`).
1120    pub fn run_main_legacy_i64_fast(&self, args: &[i64]) -> Result<i64, RuntimeError> {
1121        let ptr = self
1122            .jit
1123            .fast_entry_ptr
1124            .ok_or_else(|| RuntimeError::Unsupported {
1125                reason:
1126                    "llvm-aot: fast entry not available; source not Int-only or fast-emit failed"
1127                        .into(),
1128            })?;
1129        // Pairing invariant (single assignment site in
1130        // `from_ir_inner_world`): `fast_path_arity` is always set
1131        // together with `fast_entry_ptr`, so a live pointer means the
1132        // bare-`usize` arity is the profile's real value — no `Option`
1133        // unwrap, no panic landing pad on the per-call path. (An
1134        // `expect` here once de-inlined this function under fat LTO
1135        // and regressed the W12 kernel from 3.55ns to 9.46ns/call.)
1136        let arity = self.fast_path_arity;
1137        if args.len() != arity {
1138            return Err(RuntimeError::Unsupported {
1139                reason: format!(
1140                    "llvm-aot fast path: #main expects {arity} arg(s), got {}",
1141                    args.len()
1142                ),
1143            });
1144        }
1145        // SAFETY: the cached pointer came back from
1146        // `ExecutionEngine::get_function_address(ENTRY_SYMBOL_FAST)`
1147        // which guarantees the symbol is live for the engine's
1148        // lifetime. The arity-specialised dispatch table mirrors the
1149        // typed signature `emit_fast_entry` produced for this
1150        // function shape.
1151        unsafe {
1152            let r = match arity {
1153                0 => {
1154                    let f: FastEntryFn0 = std::mem::transmute(ptr);
1155                    f()
1156                }
1157                1 => {
1158                    let f: FastEntryFn1 = std::mem::transmute(ptr);
1159                    f(args[0])
1160                }
1161                2 => {
1162                    let f: FastEntryFn2 = std::mem::transmute(ptr);
1163                    f(args[0], args[1])
1164                }
1165                3 => {
1166                    let f: FastEntryFn3 = std::mem::transmute(ptr);
1167                    f(args[0], args[1], args[2])
1168                }
1169                4 => {
1170                    let f: FastEntryFn4 = std::mem::transmute(ptr);
1171                    f(args[0], args[1], args[2], args[3])
1172                }
1173                5 => {
1174                    let f: FastEntryFn5 = std::mem::transmute(ptr);
1175                    f(args[0], args[1], args[2], args[3], args[4])
1176                }
1177                6 => {
1178                    let f: FastEntryFn6 = std::mem::transmute(ptr);
1179                    f(args[0], args[1], args[2], args[3], args[4], args[5])
1180                }
1181                7 => {
1182                    let f: FastEntryFn7 = std::mem::transmute(ptr);
1183                    f(
1184                        args[0], args[1], args[2], args[3], args[4], args[5], args[6],
1185                    )
1186                }
1187                8 => {
1188                    let f: FastEntryFn8 = std::mem::transmute(ptr);
1189                    f(
1190                        args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
1191                    )
1192                }
1193                n => {
1194                    return Err(RuntimeError::Unsupported {
1195                        reason: format!("llvm-aot fast path: arity {n} > 8 dispatch cap"),
1196                    });
1197                }
1198            };
1199            Ok(r)
1200        }
1201    }
1202
1203    /// Try the fast path first: when the schema qualifies and every
1204    /// supplied arg is `Int`, dispatch through the typed JIT entry
1205    /// and wrap the i64 result. Returns `Ok(None)` when the fast
1206    /// path isn't applicable for this call (caller falls back to the
1207    /// buffer entry). `Ok(Some(v))` on a successful fast dispatch;
1208    /// `Err` only when the dispatch itself failed.
1209    fn try_run_main_fast(
1210        &self,
1211        args: &HashMap<String, Value>,
1212    ) -> Result<Option<Value>, RuntimeError> {
1213        if self.jit.fast_entry_ptr.is_none() {
1214            return Ok(None);
1215        }
1216        if !self.fast_path_auto_dispatch {
1217            return Ok(None);
1218        }
1219        // Pairing invariant: `fast_path_arity` is assigned together
1220        // with `fast_entry_ptr` at the single resolution site, so
1221        // reaching here (entry ptr is Some) means the bare-`usize`
1222        // arity is live — no `Option` unwrap on the dispatch path.
1223        let arity = self.fast_path_arity;
1224        if arity != self.param_names.len() {
1225            // Schema arity mismatch — shouldn't happen if
1226            // `build_fast_path_profile` agreed, but be defensive.
1227            return Ok(None);
1228        }
1229        let mut argv = [0i64; 8];
1230        for (i, name) in self.param_names.iter().enumerate() {
1231            match args.get(name) {
1232                Some(Value::Int(v)) => argv[i] = *v,
1233                _ => return Ok(None), // missing or non-Int arg → fall back
1234            }
1235        }
1236        let r = self.run_main_legacy_i64_fast(&argv[..arity])?;
1237        // Phase D.2: re-wrap the i64 result to match the buffer
1238        // path's `Value` shape. The fast-path profile gate accepts
1239        // both the canonical `Ret { value: Int }` wrapper (Phase
1240        // D.1 — surfaces as bare `Value::Int`) and any user-declared
1241        // anon-record return collapsed to a single Int field (Phase
1242        // D.2 — surfaces as `Value::Dict { <field_name>: Int }` to
1243        // match `run_main_buffer`'s `read_record_into_map` decode).
1244        // `is_single_value_wrapper` discriminates the two — strict
1245        // canonical name match → bare scalar; otherwise → branded
1246        // dict.
1247        if let Some(schema) = self.buffer_schema.as_ref() {
1248            if is_single_value_wrapper(&schema.return_schema) {
1249                Ok(Some(Value::Int(r)))
1250            } else {
1251                let field_name = schema.return_schema.fields[0].name.clone();
1252                let mut map: HashMap<String, Value> = HashMap::with_capacity(1);
1253                map.insert(field_name, Value::Int(r));
1254                Ok(Some(Value::branded_dict(
1255                    map,
1256                    Some(schema.return_schema.name.clone()),
1257                )))
1258            }
1259        } else {
1260            Ok(Some(Value::Int(r)))
1261        }
1262    }
1263
1264    /// Buffer-protocol `run_main`: pack the HashMap-keyed args into
1265    /// an arena, invoke the JIT, decode the return record.
1266    fn run_main_buffer(&self, args: HashMap<String, Value>) -> Result<Value, RuntimeError> {
1267        let schema = self
1268            .buffer_schema
1269            .as_ref()
1270            .ok_or_else(|| RuntimeError::Unsupported {
1271                reason: "llvm-aot: run_main_buffer called without schema metadata".into(),
1272            })?;
1273
1274        // 1. Pack the args into a buffer using `BufferBuilder`.
1275        let mut builder = relon_eval_api::buffer::BufferBuilder::new(
1276            &schema.main_layout,
1277            &schema.main_schema.fields,
1278        );
1279        for field in &schema.main_schema.fields {
1280            let value = args
1281                .get(&field.name)
1282                .ok_or_else(|| RuntimeError::Unsupported {
1283                    reason: format!("llvm-aot: missing #main arg `{}`", field.name),
1284                })?;
1285            write_value_into_builder(&mut builder, field, value)?;
1286        }
1287        // F1: bake `in_ptr` into every input pointer slot (arena-absolute
1288        // convention), so the JIT body's param reads drop their `+ in_ptr`
1289        // rebase. `in_ptr` depends only on const-data length.
1290        let in_ptr_pre = relon_util::align_up(
1291            u32::try_from(self.const_data.len()).map_err(|_| {
1292                RuntimeError::IoError("llvm const-data section exceeds u32 range".into())
1293            })?,
1294            8,
1295        );
1296        let in_bytes = builder
1297            .finish_arena_absolute(in_ptr_pre)
1298            .map_err(buffer_to_runtime_error)?;
1299
1300        // 2. Lay out the arena. Phase E.1 widens the layout to match
1301        // the cranelift backend: `[const_data | pad | in_buf | pad |
1302        // out_buf (root + tail cap) | pad | scratch]`. The const-data
1303        // pool lives at offset 0; ConstString-emitted offsets point
1304        // directly at the records inside it. The scratch region at
1305        // the tail backs the bump allocator (`AllocScratchDyn`).
1306        let in_len = in_bytes.len() as u32;
1307        let out_root_size = schema.return_layout.root_size as u32;
1308        // For String / List return types we reserve a chunky tail-
1309        // cursor region so pointer-indirect StoreField can stamp the
1310        // payload past the fixed-area slot without re-allocating on
1311        // every dispatch.
1312        let needs_pointer_indirect_return = return_needs_tail_region(&schema.return_schema);
1313        // Cap the output region:
1314        //   * fixed area: max(root_size, 8) padded to 8.
1315        //   * tail area: 64 KiB cushion for String returns (W3 hits
1316        //     ~3 KiB per dispatch at STRING_CONCAT_N = 3 000; a 64 KiB
1317        //     reservation keeps the bump path away from arena edges
1318        //     without ballooning the allocation).
1319        let tail_cap: u32 = if needs_pointer_indirect_return {
1320            65_536
1321        } else {
1322            0
1323        };
1324        let out_cap = relon_util::align_up(out_root_size.max(8) + tail_cap + 16, 8);
1325        let const_data_len = u32::try_from(self.const_data.len()).map_err(|_| {
1326            RuntimeError::IoError("llvm const-data section exceeds u32 range".into())
1327        })?;
1328        let in_ptr = relon_util::align_up(const_data_len, 8);
1329        let out_ptr = relon_util::align_up(in_ptr + in_len, 8);
1330        let scratch_base = relon_util::align_up(out_ptr + out_cap, 8);
1331        // Scratch region size: 64 KiB matches the cranelift backend's
1332        // figure; the W3 hot-loop concat writes ~3*N bytes total but
1333        // the scratch cursor never resets within a dispatch (each
1334        // iteration's intermediate string sticks around until
1335        // run-end) so we need enough headroom for the worst-case
1336        // W3-style `O(N^2)` allocation pattern.
1337        let scratch_size: u32 = 1_048_576; // 1 MiB
1338        let arena_size = (scratch_base + scratch_size) as usize;
1339
1340        // 3. Acquire the per-thread arena buffer, install the
1341        // input bytes, dispatch. Reentrant calls (a stdlib helper
1342        // looping back through the evaluator on the same thread)
1343        // fall back to a fresh `Vec<u8>` — correctness wins over
1344        // pool reuse on the vanishingly rare path.
1345        LLVM_ARENA_POOL.with(|cell| match cell.try_borrow_mut() {
1346            Ok(mut buf) => self.dispatch_with_arena(
1347                schema,
1348                &mut buf,
1349                arena_size,
1350                in_ptr,
1351                in_len,
1352                out_ptr,
1353                out_cap,
1354                scratch_base,
1355                &in_bytes,
1356            ),
1357            Err(_) => {
1358                let mut fallback: Vec<u8> = Vec::new();
1359                self.dispatch_with_arena(
1360                    schema,
1361                    &mut fallback,
1362                    arena_size,
1363                    in_ptr,
1364                    in_len,
1365                    out_ptr,
1366                    out_cap,
1367                    scratch_base,
1368                    &in_bytes,
1369                )
1370            }
1371        })
1372    }
1373
    /// Inner driver shared by the pooled-arena and fallback-arena
    /// branches of [`Self::run_main_buffer`]. Grows `arena` to at least
    /// `arena_size` (it is never shrunk), lays the const-data prefix and
    /// `in_bytes` into it, invokes the JIT entry, then decodes the
    /// output region.
    ///
    /// All offsets (`in_ptr`, `out_ptr`, `scratch_base`) are relative to
    /// the start of `arena`; `in_len` / `out_cap` are region lengths in
    /// bytes. They are forwarded to the JIT entry as `i32`.
    ///
    /// # Errors
    ///
    /// - `RuntimeError::Unsupported` when the JIT entry call unwinds.
    /// - The error built by `NativeTrap::runtime_error_from_code` when the
    ///   per-call state reports a non-zero trap code after the call.
    /// - Otherwise whatever [`Self::decode_buffer_return`] reports.
    // The argument list mirrors the arena layout one-to-one; bundling it
    // is left to the callers' `ArenaRegions` construction below.
    #[allow(clippy::too_many_arguments)]
    fn dispatch_with_arena(
        &self,
        schema: &BufferSchema,
        arena: &mut Vec<u8>,
        arena_size: usize,
        in_ptr: u32,
        in_len: u32,
        out_ptr: u32,
        out_cap: u32,
        scratch_base: u32,
        in_bytes: &[u8],
    ) -> Result<Value, RuntimeError> {
        // Grow only: a pooled arena that is already large enough keeps its
        // length (and whatever bytes a previous dispatch left past
        // `observable_end`).
        if arena.len() < arena_size {
            arena.resize(arena_size, 0);
        }
        // Zero only the region the JIT can observe before writing —
        // const_data is overwritten in full, in_bytes are copied on
        // top of the input area, the out region must read as zero
        // because pointer-indirect StoreField bumps into a
        // freshly-zero tail cursor, and the scratch tail is written
        // before being read by the JIT itself.
        let observable_end = (out_ptr + out_cap) as usize;
        debug_assert!(observable_end <= arena_size);
        debug_assert!(self.const_data.len() <= in_ptr as usize);
        // Clears everything from the end of const-data up to the end of
        // the out region: alignment padding, the input area, and `out`.
        arena[self.const_data.len()..observable_end].fill(0);
        if !self.const_data.is_empty() {
            arena[..self.const_data.len()].copy_from_slice(&self.const_data);
        }
        arena[in_ptr as usize..in_ptr as usize + in_bytes.len()].copy_from_slice(in_bytes);

        // The per-call state only sees the first `arena_size` bytes, even
        // when the pooled buffer is longer.
        let live_arena = &mut arena[..arena_size];
        let state = ArenaState::new(live_arena, scratch_base);
        state.set_step_budget(self.step_budget.load(Ordering::Relaxed));
        // Phase 0b: point the per-call state at the host-fn registry so
        // a source-lowered `Op::CallNative` resolves through
        // `relon_llvm_call_native`. The registry lives on the evaluator
        // behind an `Arc` and outlives this dispatch.
        // SAFETY: `self.host_fns` is kept alive for the whole call (and
        // the evaluator's lifetime); the per-call state is the sole
        // owner of the `UnsafeCell` for the dispatch's duration.
        unsafe {
            state.install_host_fns(Arc::as_ptr(&self.host_fns));
        }
        let state_ptr: *const ArenaState = &state;

        // SAFETY: same pattern as the cranelift backend's
        // `invoke_buffer_entry`. The JIT entry was emitted with the
        // canonical buffer-protocol signature; the cached fn pointer
        // is alive for the engine's lifetime. The arena slice
        // `live_arena` outlives the JIT call.
        let bytes_written = {
            let f: BufferEntryFn = unsafe { std::mem::transmute(self.jit.entry_ptr) };
            // An unwind out of the entry call is converted into an
            // `Unsupported` error instead of propagating to the host.
            std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| unsafe {
                f(
                    state_ptr,
                    in_ptr as i32,
                    in_len as i32,
                    out_ptr as i32,
                    out_cap as i32,
                    /*caps=*/ self.caps_mask,
                )
            }))
            .map_err(|_| RuntimeError::Unsupported {
                reason: "llvm-aot: JIT entry panicked (no trap-code recovery in Phase B)".into(),
            })?
        };

        // Phase 0b: a `CheckCap` deny or a failed `CallNative` dispatch
        // returns the negative sentinel and records the precise cause in
        // `state.trap_code`. Lift it to a typed `RuntimeError` (the same
        // outcome class the cranelift backend surfaces) before the
        // generic negative-bytes_written path.
        let trap_code = state.trap_code();
        if trap_code != 0 {
            return Err(crate::state::NativeTrap::runtime_error_from_code(trap_code));
        }
        // Decode the buffer return out of the arena. The decode is
        // backend-shared and arena-source-agnostic (host JIT arena here;
        // wasm linear memory in the wasm-evaluator path) — see
        // [`Self::decode_buffer_return`].
        self.decode_buffer_return(
            schema,
            arena,
            ArenaRegions {
                const_data_len: self.const_data.len(),
                in_ptr,
                in_len,
                out_ptr,
                out_cap,
                scratch_base,
                arena_size,
            },
            bytes_written,
        )
    }
1474
1475    /// Decode a buffer-protocol return out of an arena, given the raw
1476    /// i32 the entry returned (`bytes_written` / sentinel) and the arena
1477    /// region boundaries.
1478    ///
1479    /// This is the **single** post-call decode the native JIT path and
1480    /// the wasm-evaluator path share. It is deliberately source-agnostic:
1481    /// `arena` is just `&[u8]` (the host JIT arena, or a slice of wasm
1482    /// linear memory rebased to the arena origin), and every region
1483    /// offset in `regions` is arena-relative, so the wasm host can hand
1484    /// the same view and offsets the JIT path computes.
1485    ///
1486    /// Two paths, identical to the historical inline decode:
1487    /// - **negative** `ret`: the in-place region-walk sentinel
1488    ///   `-(root_abs + 1)`. We recover `root_abs`, then defer entirely to
1489    ///   the backend-shared `relon_eval_api::inplace_return` pipeline
1490    ///   (region-select → **verifier** → in-place decode). The verifier
1491    ///   is non-negotiable: an unverified buffer is never decoded, on the
1492    ///   wasm linear-memory path exactly as on the host path.
1493    /// - **non-negative** `ret`: the fixed-area / tail-cursor return; the
1494    ///   `BufferReader` walks `out_buf`.
1495    fn decode_buffer_return(
1496        &self,
1497        schema: &BufferSchema,
1498        arena: &[u8],
1499        regions: ArenaRegions,
1500        ret: i32,
1501    ) -> Result<Value, RuntimeError> {
1502        // In-place region-walk return ABI (S2): a negative return value
1503        // is the in-place sentinel `-(root_abs + 1)`. Instead of a value
1504        // copied into `out_buf`, the machine code reports the
1505        // arena-relative offset of the return root — a `List<List<scalar>>`,
1506        // `List<String>`, or `List<Schema>` value sourced from a `#main`
1507        // parameter identity.
1508        // We rebase it to its source region, run the bounds verifier over
1509        // the whole reachable graph confined to that region, and only on
1510        // a clean verify decode the value in place. A verifier failure is
1511        // a loud error — we never decode an unverified in-place return.
1512        // The decode pipeline (sentinel → region-select → verifier →
1513        // decode) is shared with the cranelift backend via
1514        // `relon_eval_api::inplace_return`, and reused verbatim by the
1515        // wasm host (the arena is then a slice of wasm linear memory).
1516        if ret < 0 {
1517            let root_abs = relon_eval_api::inplace_return::decode_inplace_sentinel(ret)?;
1518            if !is_single_value_wrapper(&schema.return_schema) {
1519                return Err(RuntimeError::IoError(
1520                    "llvm-aot in-place return on a non-single-value return schema".into(),
1521                ));
1522            }
1523            return relon_eval_api::inplace_return::decode_inplace_return(
1524                "llvm-aot",
1525                arena,
1526                regions,
1527                root_abs,
1528                &schema.return_schema.fields[0],
1529                &schema.return_layout,
1530                &schema.return_schema.fields,
1531            );
1532        }
1533        let bw = ret as usize;
1534
1535        let read_len = bw.max(schema.return_layout.root_size);
1536        let out_ptr = regions.out_ptr as usize;
1537        let read_end = out_ptr + read_len;
1538        if read_end > regions.arena_size || read_end > arena.len() {
1539            return Err(RuntimeError::IoError(
1540                "llvm-aot arena too small for return decode".into(),
1541            ));
1542        }
1543        let arena = &arena[..regions.arena_size.min(arena.len())];
1544        // Object / fixed-area return path: the shared central entry gates
1545        // the record through the multi-region bounds verifier BEFORE any
1546        // decode (verify → decode is enforced inside, so no object-return
1547        // caller can skip it), then walks the backend-shared object-field
1548        // reader. Under the F1 arena-absolute slot convention the object
1549        // head sits at `out_ptr` and every pointer slot it carries is an
1550        // arena-absolute offset, so the reader + verifier walk the **whole
1551        // arena** anchored at `out_ptr`. The gate confines every followed
1552        // span to one region (today all in `out`; cross-region object
1553        // fields stay capped — F1b releases them) and closes the red-line
1554        // gap where the object path previously decoded with no verifier.
1555        relon_eval_api::inplace_return::decode_object_return(
1556            "llvm-aot",
1557            arena,
1558            out_ptr,
1559            regions,
1560            &schema.return_layout,
1561            &schema.return_schema,
1562            is_single_value_wrapper(&schema.return_schema),
1563        )
1564    }
1565
1566    /// Plan a wasm buffer-protocol dispatch: pack the `#main` args into
1567    /// the input record and compute the same arena layout
1568    /// `run_main_buffer` lays for the host JIT.
1569    ///
1570    /// The wasm host (wasmtime) lays the returned [`WasmBufferDispatch`]
1571    /// into linear memory, invokes the exported buffer entry, then hands
1572    /// the post-call arena view back to [`Self::wasm_buffer_decode`]. The
1573    /// arena layout, the const-data prefix, and the input packing are
1574    /// **byte-identical** to the host path, so the wasm module — which is
1575    /// the same LLVM IR retargeted to wasm32 — observes exactly the arena
1576    /// the JIT body was emitted against. The single divergence is the
1577    /// arena's absolute base in memory (a host `Vec` vs. a wasm
1578    /// linear-memory offset), which the wasm body absorbs through its
1579    /// `arena_base` global; every offset here is arena-relative.
1580    pub fn wasm_buffer_plan(
1581        &self,
1582        args: &HashMap<String, Value>,
1583    ) -> Result<WasmBufferDispatch, RuntimeError> {
1584        let schema = self
1585            .buffer_schema
1586            .as_ref()
1587            .ok_or_else(|| RuntimeError::Unsupported {
1588                reason: "llvm-aot: wasm_buffer_plan called without schema metadata".into(),
1589            })?;
1590
1591        // Pack the input record exactly as `run_main_buffer` does.
1592        let mut builder = relon_eval_api::buffer::BufferBuilder::new(
1593            &schema.main_layout,
1594            &schema.main_schema.fields,
1595        );
1596        for field in &schema.main_schema.fields {
1597            let value = args
1598                .get(&field.name)
1599                .ok_or_else(|| RuntimeError::Unsupported {
1600                    reason: format!("llvm-aot: missing #main arg `{}`", field.name),
1601                })?;
1602            write_value_into_builder(&mut builder, field, value)?;
1603        }
1604        // F1: bake `in_ptr` into every input pointer slot (arena-absolute
1605        // convention) — identical to `run_main_buffer`, so the wasm module
1606        // (same IR retargeted) sees the same input bytes.
1607        let in_ptr_pre = relon_util::align_up(
1608            u32::try_from(self.const_data.len()).map_err(|_| {
1609                RuntimeError::IoError("llvm const-data section exceeds u32 range".into())
1610            })?,
1611            8,
1612        );
1613        let in_bytes = builder
1614            .finish_arena_absolute(in_ptr_pre)
1615            .map_err(buffer_to_runtime_error)?;
1616
1617        // Lay out the arena identically to `run_main_buffer`.
1618        let in_len = in_bytes.len() as u32;
1619        let out_root_size = schema.return_layout.root_size as u32;
1620        let needs_pointer_indirect_return = return_needs_tail_region(&schema.return_schema);
1621        let tail_cap: u32 = if needs_pointer_indirect_return {
1622            65_536
1623        } else {
1624            0
1625        };
1626        let out_cap = relon_util::align_up(out_root_size.max(8) + tail_cap + 16, 8);
1627        let const_data_len = u32::try_from(self.const_data.len()).map_err(|_| {
1628            RuntimeError::IoError("llvm const-data section exceeds u32 range".into())
1629        })?;
1630        let in_ptr = relon_util::align_up(const_data_len, 8);
1631        let out_ptr = relon_util::align_up(in_ptr + in_len, 8);
1632        let scratch_base = relon_util::align_up(out_ptr + out_cap, 8);
1633        let scratch_size: u32 = 1_048_576;
1634        let arena_size = (scratch_base + scratch_size) as usize;
1635
1636        Ok(WasmBufferDispatch {
1637            const_data: self.const_data.clone(),
1638            in_bytes,
1639            regions: ArenaRegions {
1640                const_data_len: self.const_data.len(),
1641                in_ptr,
1642                in_len,
1643                out_ptr,
1644                out_cap,
1645                scratch_base,
1646                arena_size,
1647            },
1648        })
1649    }
1650
1651    /// Decode a wasm buffer-protocol return. `arena` is a slice of the
1652    /// wasm linear memory **rebased to the arena origin** (i.e.
1653    /// `&memory[arena_abs .. arena_abs + arena_size]`), so the
1654    /// arena-relative offsets in `regions` and the arena-relative root in
1655    /// the negative sentinel resolve exactly as they do on the host JIT
1656    /// path. `ret` is the i32 the wasm entry returned.
1657    ///
1658    /// This routes through the **same** [`Self::decode_buffer_return`] the
1659    /// host path uses — the in-place sentinel still runs the
1660    /// `relon_eval_api::inplace_return` verifier over the linear-memory
1661    /// slice before any decode. There is no wasm-specific decode or
1662    /// wasm-specific verifier.
1663    pub fn wasm_buffer_decode(
1664        &self,
1665        arena: &[u8],
1666        regions: ArenaRegions,
1667        ret: i32,
1668    ) -> Result<Value, RuntimeError> {
1669        let schema = self
1670            .buffer_schema
1671            .as_ref()
1672            .ok_or_else(|| RuntimeError::Unsupported {
1673                reason: "llvm-aot: wasm_buffer_decode called without schema metadata".into(),
1674            })?;
1675        self.decode_buffer_return(schema, arena, regions, ret)
1676    }
1677}
1678
/// A planned wasm buffer-protocol dispatch produced by
/// [`LlvmAotEvaluator::wasm_buffer_plan`]: the const-data prefix, the
/// packed input record, and the full arena region layout.
///
/// Intended use by the wasm host, in order:
/// 1. copy `const_data` to arena offset 0,
/// 2. copy `in_bytes` to arena offset `regions.in_ptr`,
/// 3. invoke the entry symbol it emitted,
/// 4. pass the post-call arena view, `regions`, and the entry's `i32`
///    return to [`LlvmAotEvaluator::wasm_buffer_decode`].
///
/// The struct owns its byte vectors (`const_data` is a clone of the
/// evaluator's pool), so it can outlive the borrow of the evaluator that
/// produced it.
#[derive(Debug, Clone)]
pub struct WasmBufferDispatch {
    /// Const-pool blob; laid at arena offset 0 (before `in_ptr`).
    pub const_data: Vec<u8>,
    /// Packed input record; laid at `regions.in_ptr`. Its length equals
    /// `regions.in_len`.
    pub in_bytes: Vec<u8>,
    /// Arena region boundaries (all arena-relative).
    pub regions: ArenaRegions,
}
1694
1695impl Evaluator for LlvmAotEvaluator {
1696    fn eval(&self, _node: &Node, _scope: &Arc<Scope>) -> Result<Value, RuntimeError> {
1697        Err(RuntimeError::Unsupported {
1698            reason: "llvm-aot: `eval` is not supported".into(),
1699        })
1700    }
1701
1702    fn eval_root(&self, _scope: &Arc<Scope>) -> Result<Value, RuntimeError> {
1703        Err(RuntimeError::Unsupported {
1704            reason: "llvm-aot: `eval_root` is not supported".into(),
1705        })
1706    }
1707
1708    fn run_main(&self, args: HashMap<String, Value>) -> Result<Value, RuntimeError> {
1709        // Phase D.1 dispatch-boundary fast path: try the typed entry
1710        // first. Falls through to the buffer-protocol path on
1711        // mismatch (non-Int args, schema past the Int-only envelope,
1712        // no fast entry emitted) — transparent to the host.
1713        if let Some(v) = self.try_run_main_fast(&args)? {
1714            return Ok(v);
1715        }
1716        match self.entry_shape {
1717            EntryShape::Buffer => self.run_main_buffer(args),
1718            EntryShape::LegacyI64 => {
1719                // Pack the HashMap into a positional i64 argv using
1720                // the declared parameter order.
1721                let mut argv = [0i64; MAX_LEGACY_ARITY];
1722                for (i, name) in self.param_names.iter().enumerate() {
1723                    let v = args.get(name).ok_or_else(|| RuntimeError::Unsupported {
1724                        reason: format!("llvm-aot: missing #main arg `{name}`"),
1725                    })?;
1726                    match v {
1727                        Value::Int(n) => argv[i] = *n,
1728                        other => {
1729                            return Err(RuntimeError::Unsupported {
1730                                reason: format!(
1731                                    "llvm-aot: legacy-i64 #main arg `{name}` is {} (Int only)",
1732                                    other.type_name()
1733                                ),
1734                            });
1735                        }
1736                    }
1737                }
1738                let r = self.run_main_legacy_i64(&argv[..self.entry_arity])?;
1739                Ok(Value::Int(r))
1740            }
1741        }
1742    }
1743
1744    fn force_thunk(&self, _thunk: &Arc<Thunk>) -> Result<Value, RuntimeError> {
1745        Err(RuntimeError::Unsupported {
1746            reason: "llvm-aot: `force_thunk` is not supported".into(),
1747        })
1748    }
1749
1750    fn invoke_closure(
1751        &self,
1752        _closure: &ClosureData,
1753        _args: &[Value],
1754    ) -> Result<Value, RuntimeError> {
1755        Err(RuntimeError::Unsupported {
1756            reason: "llvm-aot: `invoke_closure` is not supported".into(),
1757        })
1758    }
1759}
1760
1761// ---------------------------------------------------------------------------
1762// Buffer-protocol packing / unpacking helpers.
1763//
1764// These mirror what `relon-codegen-cranelift::evaluator` does for
1765// `write_value_into_builder` / `is_single_value_wrapper` /
1766// `buffer_to_runtime_error`. The object-return *decode* side is no
1767// longer mirrored per crate — it lives once in
1768// `relon_eval_api::inplace_return::decode_object_return`. Kept inside
1769// this crate so the LLVM backend has no compile-time dep on
1770// cranelift-native.
1771// ---------------------------------------------------------------------------
1772
1773fn buffer_to_runtime_error(e: relon_eval_api::buffer::BufferError) -> RuntimeError {
1774    RuntimeError::IoError(format!("llvm-aot buffer: {e}"))
1775}
1776
1777fn is_single_value_wrapper(schema: &relon_eval_api::schema_canonical::Schema) -> bool {
1778    schema.name == relon_ir::MAIN_RETURN_SCHEMA_NAME
1779        && schema.fields.len() == 1
1780        && schema.fields[0].name == relon_ir::RETURN_VALUE_FIELD_NAME
1781}
1782
1783/// Phase D.2: looser sibling of [`is_single_value_wrapper`] used to
1784/// gate the typed-i64 fast-path. Accepts any single-field record whose
1785/// sole field is `Int` — the canonical `Ret { value: Int }` wrapper
1786/// **and** any user-declared `#main(...) -> Dict` whose anon-record
1787/// lowering collapsed to one `Int` field (W7's `{ result: Int }` is
1788/// the motivating case).
1789///
1790/// The strict [`is_single_value_wrapper`] check stays in place for the
1791/// `run_main` buffer decoder — branded user dicts must still surface
1792/// as `Value::Dict` for the host, not be unwrapped to a bare scalar.
1793fn is_single_int_field_record(schema: &relon_eval_api::schema_canonical::Schema) -> bool {
1794    use relon_eval_api::schema_canonical::TypeRepr;
1795    // A tuple schema (`is_tuple`) decodes positionally to a `Value::Tuple`,
1796    // never to a scalar / branded dict — so a 1-tuple
1797    // `Tuple<Int>` must NOT take the typed-i64 fast path (which would
1798    // return the wrong container shape). Force it onto the buffer path so
1799    // the shared `decode_object_return` tuple fork runs.
1800    !schema.is_tuple && schema.fields.len() == 1 && matches!(schema.fields[0].ty, TypeRepr::Int)
1801}
1802
1803/// Marshal a typed [`Value`] into the buffer slot for `field` on the
1804/// way *into* the JIT body (host → arena).
1805///
1806/// ## marshalling-seam contract (host side)
1807///
1808/// This dispatcher is one of the per-type marshalling seams S1.A
1809/// carved out so each leaf type owns a private `marshal_<type>_in`
1810/// helper rather than living inline in a single fat `match`. Adding a
1811/// new leaf type means: (1) add an arm here delegating to a new
1812/// `marshal_<type>_in`, (2) add the symmetric arm to the shared
1813/// object-return decoder `relon_eval_api::inplace_return` (reached via
1814/// `decode_object_return`), and (3) widen the build.rs-visible
1815/// [`EmittedFieldType`] triple (see that enum's docs).
1816///
1817/// Note: MCJIT already marshals `Float` / `Schema` here; the
1818/// build.rs-visible [`EmittedFieldType`] surface is the *narrower* set
1819/// (see [`lower_field_descriptors`]). Keep the two in mind separately —
1820/// this seam is the runtime marshaller, `EmittedFieldType` is the
1821/// AOT-binding signature surface.
1822fn write_value_into_builder(
1823    builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
1824    field: &relon_eval_api::schema_canonical::Field,
1825    value: &Value,
1826) -> Result<(), RuntimeError> {
1827    use relon_eval_api::schema_canonical::TypeRepr;
1828    match (&field.ty, value) {
1829        (TypeRepr::Int, Value::Int(v)) => marshal_int_in(builder, &field.name, *v),
1830        (TypeRepr::Float, Value::Float(v)) => {
1831            marshal_float_in(builder, &field.name, v.into_inner())
1832        }
1833        (TypeRepr::Float, Value::Int(v)) => marshal_float_in(builder, &field.name, *v as f64),
1834        (TypeRepr::Bool, Value::Bool(v)) => marshal_bool_in(builder, &field.name, *v),
1835        (TypeRepr::Unit, v) if v.is_option_none() => marshal_unit_in(builder, &field.name),
1836        (TypeRepr::String, Value::String(s)) => marshal_string_in(builder, &field.name, s),
1837        (TypeRepr::Schema { schema }, Value::Dict(dict)) if !schema.is_tuple => {
1838            marshal_schema_in(builder, &field.name, schema, dict)
1839        }
1840        (TypeRepr::Schema { schema }, Value::Tuple(items)) if schema.is_tuple => {
1841            marshal_tuple_in(builder, &field.name, schema, items.as_ref())
1842        }
1843        (TypeRepr::List { element }, Value::List(items)) => {
1844            marshal_list_in(builder, &field.name, element, items)
1845        }
1846        (TypeRepr::Option { .. } | TypeRepr::Result { .. } | TypeRepr::Enum { .. }, _) => builder
1847            .write_value(&field.name, &field.ty, value)
1848            .map_err(buffer_to_runtime_error),
1849        // ----- add new leaf marshalling arm above this line -----
1850        (ty, v) => Err(RuntimeError::Unsupported {
1851            reason: format!(
1852                "llvm-aot: #main arg `{}` got {} but schema expects {ty:?}",
1853                field.name,
1854                v.type_name()
1855            ),
1856        }),
1857    }
1858}
1859
1860// --- per-variant host-side input marshalling helpers (S1.A seam) ---
1861//
// One `marshal_<type>_in` per leaf type. Each lane (Int, Float, Bool,
// Unit, String, List, …) owns its helper, so a new lane is added here
// without touching sibling arms.
1864
1865fn marshal_int_in(
1866    builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
1867    name: &str,
1868    v: i64,
1869) -> Result<(), RuntimeError> {
1870    builder.write_int(name, v).map_err(buffer_to_runtime_error)
1871}
1872
1873fn marshal_float_in(
1874    builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
1875    name: &str,
1876    v: f64,
1877) -> Result<(), RuntimeError> {
1878    builder
1879        .write_float(name, v)
1880        .map_err(buffer_to_runtime_error)
1881}
1882
1883fn marshal_bool_in(
1884    builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
1885    name: &str,
1886    v: bool,
1887) -> Result<(), RuntimeError> {
1888    builder.write_bool(name, v).map_err(buffer_to_runtime_error)
1889}
1890
1891fn marshal_unit_in(
1892    builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
1893    name: &str,
1894) -> Result<(), RuntimeError> {
1895    builder.write_unit(name).map_err(buffer_to_runtime_error)
1896}
1897
1898/// Top-level / schema `String` `#main` arg marshalling. The
1899/// pointer-indirect `BufferBuilder::write_string` appends a
1900/// `[len: u32 LE][utf8]` record into the parent buffer's tail area and
1901/// back-patches the 4-byte buffer-relative offset slot the JIT's
1902/// `LoadStringPtr` reads — the same record shape `ConstString` bakes.
1903fn marshal_string_in(
1904    builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
1905    name: &str,
1906    s: &str,
1907) -> Result<(), RuntimeError> {
1908    builder
1909        .write_string(name, s)
1910        .map_err(buffer_to_runtime_error)
1911}
1912
1913/// `List<…>` `#main` arg marshalling. Dispatches on the canonical
1914/// element type to the matching pointer-indirect `write_list_*` writer,
1915/// each of which appends the tail record (`[len][payload]` for scalar
1916/// elements, a `[len][off_0]…` pointer array of `[len][utf8]` String
1917/// records for `List<String>`) into the parent buffer's tail area and
1918/// back-patches the 4-byte buffer-relative offset slot the JIT's
1919/// `LoadList*Ptr` / pointer-indirect `LoadFieldAtAbsolute` reads — the
1920/// same shapes the ConstPool `add_list_*` blobs bake, so a list `#main`
1921/// arg and a const list return share one tail-record protocol. Element
1922/// `Value`s are type-checked against the declared element type;
1923/// `List<Schema>` (and any other element) stays a loud cap.
1924fn marshal_list_in(
1925    builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
1926    name: &str,
1927    element: &relon_eval_api::schema_canonical::TypeRepr,
1928    items: &[Value],
1929) -> Result<(), RuntimeError> {
1930    use relon_eval_api::schema_canonical::TypeRepr;
1931    let mismatch = |idx: usize, got: &Value, want: &str| RuntimeError::Unsupported {
1932        reason: format!(
1933            "llvm-aot: List<{want}> arg `{name}` element #{idx} got {} but expects {want}",
1934            got.type_name()
1935        ),
1936    };
1937    match element {
1938        TypeRepr::Int => {
1939            let mut out = Vec::with_capacity(items.len());
1940            for (i, it) in items.iter().enumerate() {
1941                match it {
1942                    Value::Int(v) => out.push(*v),
1943                    other => return Err(mismatch(i, other, "Int")),
1944                }
1945            }
1946            builder
1947                .write_list_int(name, &out)
1948                .map_err(buffer_to_runtime_error)
1949        }
1950        TypeRepr::Float => {
1951            let mut out = Vec::with_capacity(items.len());
1952            for (i, it) in items.iter().enumerate() {
1953                match it {
1954                    Value::Float(v) => out.push(v.into_inner()),
1955                    Value::Int(v) => out.push(*v as f64),
1956                    other => return Err(mismatch(i, other, "Float")),
1957                }
1958            }
1959            builder
1960                .write_list_float(name, &out)
1961                .map_err(buffer_to_runtime_error)
1962        }
1963        TypeRepr::Bool => {
1964            let mut out = Vec::with_capacity(items.len());
1965            for (i, it) in items.iter().enumerate() {
1966                match it {
1967                    Value::Bool(v) => out.push(*v),
1968                    other => return Err(mismatch(i, other, "Bool")),
1969                }
1970            }
1971            builder
1972                .write_list_bool(name, &out)
1973                .map_err(buffer_to_runtime_error)
1974        }
1975        TypeRepr::String => {
1976            let mut out: Vec<&str> = Vec::with_capacity(items.len());
1977            for (i, it) in items.iter().enumerate() {
1978                match it {
1979                    Value::String(s) => out.push(s.as_str()),
1980                    other => return Err(mismatch(i, other, "String")),
1981                }
1982            }
1983            builder
1984                .write_list_string(name, &out)
1985                .map_err(buffer_to_runtime_error)
1986        }
1987        TypeRepr::Schema { schema } => marshal_list_schema_in(builder, name, schema, items),
1988        TypeRepr::List { element: inner } => marshal_list_list_in(builder, name, inner, items),
1989        TypeRepr::Option { .. } | TypeRepr::Result { .. } | TypeRepr::Enum { .. } => {
1990            let ty = TypeRepr::List {
1991                element: Box::new(element.clone()),
1992            };
1993            builder
1994                .write_value(name, &ty, &Value::List(Arc::new(items.to_vec())))
1995                .map_err(buffer_to_runtime_error)
1996        }
1997        other => Err(RuntimeError::Unsupported {
1998            reason: format!(
1999                "llvm-aot: List element type {other:?} for arg `{name}` is not yet materialised \
2000                 (List<Int/Float/Bool/String/Schema> + List<List<scalar>>)"
2001            ),
2002        }),
2003    }
2004}
2005
2006/// Marshal a `List<Schema>` arg: each element is a branded
2007/// `Value::Dict` written as a sub-record into the parent buffer's tail
2008/// through [`relon_eval_api::buffer::ListRecordWriter`]. The list
2009/// header's per-entry offsets and the inner sub-records' own pointer
2010/// slots are relocated into the parent's coordinate system by
2011/// `finish_entry` / `finish_list_record`. Mirrors the cranelift backend.
2012fn marshal_list_schema_in(
2013    builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
2014    name: &str,
2015    schema: &relon_eval_api::schema_canonical::Schema,
2016    items: &[Value],
2017) -> Result<(), RuntimeError> {
2018    let elem_layout = relon_eval_api::layout::SchemaLayout::offsets_for(schema).map_err(|e| {
2019        RuntimeError::Unsupported {
2020            reason: format!("llvm-aot: List<Schema> arg `{name}` element layout: {e}"),
2021        }
2022    })?;
2023    let mut writer = builder
2024        .list_record_writer(name, &elem_layout, schema)
2025        .map_err(buffer_to_runtime_error)?;
2026    for (i, it) in items.iter().enumerate() {
2027        let mut child = writer.start_entry();
2028        match it {
2029            Value::Dict(dict) if !schema.is_tuple => {
2030                write_schema_into_builder(&mut child, schema, dict, name)?;
2031            }
2032            Value::Tuple(tuple_items) if schema.is_tuple => {
2033                write_tuple_into_builder(&mut child, schema, tuple_items.as_ref(), name)?;
2034            }
2035            other => {
2036                return Err(RuntimeError::Unsupported {
2037                    reason: format!(
2038                        "llvm-aot: List<Schema> arg `{name}` element #{i} got {} but expects {}",
2039                        other.type_name(),
2040                        if schema.is_tuple {
2041                            "a tuple"
2042                        } else {
2043                            "a branded record"
2044                        }
2045                    ),
2046                });
2047            }
2048        }
2049        writer
2050            .finish_entry(builder, child)
2051            .map_err(buffer_to_runtime_error)?;
2052    }
2053    builder
2054        .finish_list_record(writer)
2055        .map_err(buffer_to_runtime_error)
2056}
2057
2058/// Marshal a nested `List<List<scalar>>` arg. Each element is itself a
2059/// `Value::List` of inline-fixed scalars (`Int` / `Float` / `Bool`)
2060/// serialised into a `[len][payload]` inner record; the outer header is
2061/// a pointer array of offsets to those records. Mirrors the cranelift
2062/// backend; inner pointer-array element lists (`List<List<String>>`)
2063/// stay a loud cap at the layout pass.
2064fn marshal_list_list_in(
2065    builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
2066    name: &str,
2067    inner: &relon_eval_api::schema_canonical::TypeRepr,
2068    items: &[Value],
2069) -> Result<(), RuntimeError> {
2070    use relon_eval_api::schema_canonical::TypeRepr;
2071    // `List<List<scalar>>` keeps the inline-fixed inner-record writer;
2072    // `List<List<String|Schema|List>>` (F5) routes through the recursive
2073    // doubly-nested pointer-array marshaller.
2074    match inner {
2075        TypeRepr::Int | TypeRepr::Float | TypeRepr::Bool => {
2076            relon_eval_api::buffer::write_nested_scalar_list(builder, name, inner, items)
2077                .map_err(buffer_to_runtime_error)
2078        }
2079        _ => relon_eval_api::buffer::write_nested_pointer_array_list(builder, name, inner, items)
2080            .map_err(buffer_to_runtime_error),
2081    }
2082}
2083
2084/// Phase 0b: Schema-typed `#main` arg marshalling. A branded
2085/// `Value::Dict` (e.g. `#main(Outer o)`) lands here.
2086/// `BufferBuilder::sub_record` / `finish_sub_record` (eval-api
2087/// Phase 9.b-1) write the sub-record into the parent buffer's tail area
2088/// and back-patch the 4-byte buffer-relative offset slot in the fixed
2089/// area — exactly the slot `LoadSchemaPtr` reads. We recurse over the
2090/// sub-fields (including nested Inner); `finish_sub_record`'s internal
2091/// `relocate_pointers` rebases the child's own pointer slots into the
2092/// parent's coordinate system.
2093fn marshal_schema_in(
2094    builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
2095    name: &str,
2096    schema: &relon_eval_api::schema_canonical::Schema,
2097    dict: &relon_eval_api::ValueDict,
2098) -> Result<(), RuntimeError> {
2099    let sub_layout = relon_eval_api::layout::SchemaLayout::offsets_for(schema).map_err(|e| {
2100        RuntimeError::Unsupported {
2101            reason: format!("llvm-aot: schema arg `{name}` layout: {e}"),
2102        }
2103    })?;
2104    let mut child = builder
2105        .sub_record(name, &sub_layout, &schema.fields)
2106        .map_err(buffer_to_runtime_error)?;
2107    write_schema_into_builder(&mut child, schema, dict, name)?;
2108    builder
2109        .finish_sub_record(name, child)
2110        .map_err(buffer_to_runtime_error)
2111}
2112
2113/// Tuple-typed `#main` arg marshalling. A tuple is a positional record
2114/// (`schema.is_tuple`) at the binary layer, with a `Value::Tuple` host
2115/// shape at the API layer.
2116fn marshal_tuple_in(
2117    builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
2118    name: &str,
2119    schema: &relon_eval_api::schema_canonical::Schema,
2120    items: &[Value],
2121) -> Result<(), RuntimeError> {
2122    let sub_layout = relon_eval_api::layout::SchemaLayout::offsets_for(schema).map_err(|e| {
2123        RuntimeError::Unsupported {
2124            reason: format!("llvm-aot: tuple arg `{name}` layout: {e}"),
2125        }
2126    })?;
2127    let mut child = builder
2128        .sub_record(name, &sub_layout, &schema.fields)
2129        .map_err(buffer_to_runtime_error)?;
2130    write_tuple_into_builder(&mut child, schema, items, name)?;
2131    builder
2132        .finish_sub_record(name, child)
2133        .map_err(buffer_to_runtime_error)
2134}
2135
2136/// Recursively fill `child` (a detached sub-record builder) with the
2137/// fields of `schema`, pulling each value out of the branded `dict`.
2138/// Nested `Schema`-typed fields recurse through
2139/// [`write_value_into_builder`]'s Schema arm, which re-enters this
2140/// helper one layer down.
2141///
2142/// `parent_field` is only used for error messages so a missing nested
2143/// field names its enclosing slot.
2144fn write_schema_into_builder(
2145    child: &mut relon_eval_api::buffer::BufferBuilder<'_>,
2146    schema: &relon_eval_api::schema_canonical::Schema,
2147    dict: &relon_eval_api::ValueDict,
2148    parent_field: &str,
2149) -> Result<(), RuntimeError> {
2150    for sub_field in &schema.fields {
2151        let sub_value =
2152            dict.map
2153                .get(sub_field.name.as_str())
2154                .ok_or_else(|| RuntimeError::Unsupported {
2155                    reason: format!(
2156                        "llvm-aot: schema arg `{parent_field}` is missing field `{}`",
2157                        sub_field.name
2158                    ),
2159                })?;
2160        write_value_into_builder(child, sub_field, sub_value)?;
2161    }
2162    Ok(())
2163}
2164
2165/// Recursively fill `child` from a tuple value, pairing positional items
2166/// with the tuple schema's synthetic `"0"`, `"1"`, ... fields.
2167fn write_tuple_into_builder(
2168    child: &mut relon_eval_api::buffer::BufferBuilder<'_>,
2169    schema: &relon_eval_api::schema_canonical::Schema,
2170    items: &[Value],
2171    parent_field: &str,
2172) -> Result<(), RuntimeError> {
2173    if items.len() != schema.fields.len() {
2174        return Err(RuntimeError::Unsupported {
2175            reason: format!(
2176                "llvm-aot: tuple arg `{parent_field}` has arity {} but schema expects {}",
2177                items.len(),
2178                schema.fields.len()
2179            ),
2180        });
2181    }
2182    for (sub_field, sub_value) in schema.fields.iter().zip(items.iter()) {
2183        write_value_into_builder(child, sub_field, sub_value)?;
2184    }
2185    Ok(())
2186}
2187
2188// The object-return field decode (`read_value_from_reader` /
2189// `read_record_into_map` and the per-type `marshal_*_out` seam) now
2190// lives once in `relon_eval_api::inplace_return` and is reached through
2191// `decode_object_return`; both AOT backends share that single copy, so a
2192// new return field type is added in exactly one place.
2193
2194/// Phase E.1: does the return schema include any pointer-indirect
2195/// type (`String` / `List*`)? Drives the output buffer's tail-cap
2196/// sizing — fixed-area-only returns don't need the 64 KiB cushion.
2197fn return_needs_tail_region(schema: &relon_eval_api::schema_canonical::Schema) -> bool {
2198    use relon_eval_api::schema_canonical::TypeRepr;
2199    schema.fields.iter().any(|f| {
2200        matches!(
2201            f.ty,
2202            TypeRepr::String
2203                | TypeRepr::List { .. }
2204                | TypeRepr::Schema { .. }
2205                | TypeRepr::Option { .. }
2206                | TypeRepr::Result { .. }
2207                | TypeRepr::Enum { .. }
2208        )
2209    })
2210}
2211
// Phase D.1 / D.2 (this note describes `build_fast_path_profile`,
// defined further below, not the function that follows): discover
// whether `schema` qualifies for the typed fast-path entry. Eligibility
// requires every declared `#main` arg to be `Int` (Inline scalar at
// 8 / 8) and the return record to carry a single `Int` field — either
// the canonical `Ret { value: Int }` wrapper (Phase D.1) or any
// user-declared `#main(...) -> Dict` whose anon-record lowering
// collapsed to one `Int` field (Phase D.2 — W7's `{ result: Int }` is
// the motivating shape). Returns the `FastPathProfile` mapping
// param-declaration order to buffer offsets when eligible.
//
2220/// Whether the typed `(i64..) -> i64` fast entry can lower `entry`'s
2221/// body. The fast entry runs with **no `*state` pointer and an empty
2222/// const-pool** (see `emit_fast_entry`), so any op that resolves
2223/// against the arena-prefix const-pool — `Op::ConstString` and the
2224/// `Op::ConstList*` family — cannot be materialised on it. Such a body
2225/// must take the buffer entry even when its `#main` schema is otherwise
2226/// fast-eligible (W4: `Int -> Int` schema over a `"axb"` string
2227/// literal). Returns `false` if any reachable op references the pool.
2228///
2229/// This is the object-emit analogue of MCJIT's
2230/// emit-fast-then-roll-back-on-failure dance: rather than emit a fast
2231/// entry, watch it fail, and delete it, we predict the failure here and
2232/// route straight to the buffer entry (the object module has no second
2233/// "buffer entry also present" fallback to fall onto).
2234fn fast_entry_emittable(entry: &relon_ir::ir::Func) -> bool {
2235    !body_references_const_pool(&entry.body)
2236}
2237
2238fn body_may_raise_typed_trap(body: &[relon_ir::ir::TaggedOp]) -> bool {
2239    use relon_ir::ir::{IrType, Op};
2240    for tagged in body {
2241        let hit = match &tagged.op {
2242            Op::Add(IrType::I64)
2243            | Op::Sub(IrType::I64)
2244            | Op::Mul(IrType::I64)
2245            | Op::Div(IrType::I64)
2246            | Op::Mod(IrType::I64)
2247            | Op::Trap { .. }
2248            | Op::CheckCap { .. }
2249            | Op::CallNative { .. } => true,
2250            Op::Block { body, .. } | Op::Loop { body, .. } => body_may_raise_typed_trap(body),
2251            Op::If {
2252                then_body,
2253                else_body,
2254                ..
2255            } => body_may_raise_typed_trap(then_body) || body_may_raise_typed_trap(else_body),
2256            Op::Call { fn_index, .. } => {
2257                let stdlib = relon_ir::stdlib::builtin_stdlib();
2258                stdlib
2259                    .get(*fn_index as usize)
2260                    .map(|callee| body_may_raise_typed_trap(&callee.body_owned()))
2261                    .unwrap_or(true)
2262            }
2263            _ => false,
2264        };
2265        if hit {
2266            return true;
2267        }
2268    }
2269    false
2270}
2271
2272fn body_references_const_pool(body: &[relon_ir::ir::TaggedOp]) -> bool {
2273    use relon_ir::ir::Op;
2274    for tagged in body {
2275        let hit = match &tagged.op {
2276            Op::ConstString { .. }
2277            | Op::ConstListInt { .. }
2278            | Op::ConstListFloat { .. }
2279            | Op::ConstListBool { .. }
2280            | Op::ConstListString { .. } => true,
2281            Op::Block { body, .. } | Op::Loop { body, .. } => body_references_const_pool(body),
2282            Op::If {
2283                then_body,
2284                else_body,
2285                ..
2286            } => body_references_const_pool(then_body) || body_references_const_pool(else_body),
2287            // `Op::Call` inlines a bundled-stdlib body whose own const-
2288            // pool ops would resolve against the same (empty, on the fast
2289            // entry) pool. Mirror `ConstPool::collect_op`'s stdlib
2290            // recursion so a stdlib body that bakes a literal also forces
2291            // the buffer entry.
2292            Op::Call { fn_index, .. } => {
2293                let stdlib = relon_ir::stdlib::builtin_stdlib();
2294                stdlib
2295                    .get(*fn_index as usize)
2296                    .map(|callee| body_references_const_pool(&callee.body_owned()))
2297                    .unwrap_or(false)
2298            }
2299            _ => false,
2300        };
2301        if hit {
2302            return true;
2303        }
2304    }
2305    false
2306}
2307
2308/// P3 §2.2 wasm closed-world routing: derive a per-`import_idx`
2309/// effectful flag from the IR's `Op::CheckCap` → `Op::CallNative` shape.
2310///
2311/// The IR lowering (`try_lower_native_call`) emits one `Op::CheckCap`
2312/// per capability bit a host fn's gate requires *immediately before* the
2313/// call's argument evaluation, then the `Op::CallNative`. A **pure**
2314/// host fn (empty gate) emits zero preceding CheckCaps; an **effectful**
2315/// one (reads clock / IO / side effect — gated by a capability) emits at
2316/// least one. The `NativeImport.cap_bit` carried into codegen is always
2317/// `NO_CAPABILITY_BIT` (the guard rides the CheckCap ops, not the call),
2318/// so this CheckCap-presence scan is the in-codegen signal that survives
2319/// IR lowering — no analyzer/IR change required.
2320///
2321/// Returns `effectful[i] == true` iff import index `i`'s call site is
2322/// guarded by a preceding CheckCap. Walks every function body
2323/// (entry + helpers + lambdas), maintaining a per-body count of pending
2324/// CheckCaps consumed by the next CallNative. A pure call nested inside
2325/// an effectful call's arguments carries no CheckCap of its own, so it
2326/// won't be mis-flagged.
2327fn compute_effectful_imports(ir: &relon_ir::ir::Module) -> Vec<bool> {
2328    let mut effectful = vec![false; ir.imports.len()];
2329    for func in &ir.funcs {
2330        scan_body_effectful(&func.body, &mut effectful);
2331    }
2332    effectful
2333}
2334
2335fn scan_body_effectful(body: &[relon_ir::ir::TaggedOp], effectful: &mut [bool]) {
2336    use relon_ir::ir::Op;
2337    // Pending CheckCaps in declaration order ahead of the next CallNative
2338    // in this op sequence. The lowering pins them right before the call's
2339    // args, so a non-zero count when a CallNative is reached marks that
2340    // import effectful.
2341    let mut pending_check_caps: u32 = 0;
2342    for tagged in body {
2343        match &tagged.op {
2344            Op::CheckCap { .. } => pending_check_caps += 1,
2345            Op::CallNative { import_idx, .. } => {
2346                if pending_check_caps > 0 {
2347                    if let Some(slot) = effectful.get_mut(*import_idx as usize) {
2348                        *slot = true;
2349                    }
2350                }
2351                pending_check_caps = 0;
2352            }
2353            // Nested control flow: recurse so a CheckCap-guarded call
2354            // inside a branch / loop is still flagged. A nested block
2355            // starts its own pending count.
2356            Op::Block { body, .. } | Op::Loop { body, .. } => {
2357                scan_body_effectful(body, effectful);
2358            }
2359            Op::If {
2360                then_body,
2361                else_body,
2362                ..
2363            } => {
2364                scan_body_effectful(then_body, effectful);
2365                scan_body_effectful(else_body, effectful);
2366            }
2367            _ => {}
2368        }
2369    }
2370}
2371
2372/// order to buffer offsets when eligible.
2373fn build_fast_path_profile(schema: &BufferSchema) -> Result<FastPathProfile, ()> {
2374    use relon_eval_api::schema_canonical::TypeRepr;
2375    // Every declared #main arg must be `Int`. Pointer-indirect /
2376    // floating-point / bool / unit are out — those would require
2377    // f64 / i32 fast-entry slots we don't enumerate.
2378    for f in &schema.main_schema.fields {
2379        if !matches!(f.ty, TypeRepr::Int) {
2380            return Err(());
2381        }
2382    }
2383    // Single-Int-field record return only. Any other shape
2384    // (multi-field record, branded sub-schema with non-Int leaves,
2385    // tail-cursor String/List) escapes the typed-i64 envelope.
2386    if !is_single_int_field_record(&schema.return_schema) {
2387        return Err(());
2388    }
2389    // Collect each arg's buffer offset from the layout — declaration
2390    // order is what the JIT entry is parameterised by.
2391    let mut arg_offsets: Vec<u32> = Vec::with_capacity(schema.main_layout.fields.len());
2392    for (i, f) in schema.main_schema.fields.iter().enumerate() {
2393        // Layout's `fields` mirrors `main_schema.fields` order; cross-
2394        // check the names so a future schema reorder surfaces.
2395        let lo = schema.main_layout.fields.get(i).ok_or(())?;
2396        if lo.name != f.name {
2397            return Err(());
2398        }
2399        arg_offsets.push(lo.offset as u32);
2400    }
2401    // Arity cap — matches `emit_fast_entry`'s `arity > 8` guard.
2402    if arg_offsets.len() > 8 {
2403        return Err(());
2404    }
2405    let ret_offset = schema
2406        .return_layout
2407        .fields
2408        .first()
2409        .map(|f| f.offset as u32)
2410        .ok_or(())?;
2411    Ok(FastPathProfile {
2412        arg_offsets,
2413        ret_offset,
2414    })
2415}
2416
// NOTE(review): the paragraph below documents the LLVM `-O3` pipeline
// runner, not the `EmittedEntryShape` enum that follows; it appears to
// have been separated from its function — TODO confirm and re-attach.
//
// Run LLVM's `-O3` middle-end pipeline on `module`. The host-side
// JIT engine handles backend codegen-time optimisation; this
// function fills in the IR-level passes (mem2reg, instcombine, gvn,
// licm, loop-unroll, SLP-vectorize, …) that MCJIT does not invoke
// on its own.
//
// The implementation lazily initialises LLVM's native target the
// first time it is called — required by `Target::from_triple` /
// `create_target_machine`. Subsequent calls re-use the initialised
// target state.
//
/// Which ABI shape the emitted entry symbol exposes. Drives the
/// build.rs binding-generator's choice between a typed `(i64...) -> i64`
/// extern declaration (fast path) and a buffer-protocol call through
/// `relon-rs-shims::call_buffer_entry`.
///
/// Reported to the caller through [`EmitObjectInfo::shape`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EmittedEntryShape {
    /// `extern "C" fn(i64, ...) -> i64`. Source qualified for the
    /// dispatch-boundary fast path (Int-only `#main(Int...) -> Int`,
    /// arity <= 8, no string/list/closure). The binding wraps the
    /// extern with a thin Rust shim.
    FastInt,
    /// Full buffer-protocol entry:
    /// `extern "C" fn(*const ArenaState, i32, i32, i32, i32, i64) -> i32`.
    /// Source has string/list arguments or returns, calls into
    /// stdlib helpers, or uses helper functions. The binding marshals
    /// typed Rust args into / out of an arena buffer through
    /// `relon-rs-shims::call_buffer_entry`.
    Buffer,
}
2446
/// One declared `#main` parameter (or `value` field on the return
/// schema), in declaration order. Tells the build.rs binding generator
/// what Rust type to expose for each slot and at what byte offset the
/// buffer-protocol arena writer / reader should access it.
///
/// Carried in [`EmitObjectInfo::main_fields`] and
/// [`EmitObjectInfo::return_fields`].
#[derive(Debug, Clone)]
pub struct EmittedField {
    /// Field name as declared in source.
    pub name: String,
    /// Pre-computed byte offset of the slot inside its enclosing
    /// fixed area (main_params record for args, return record for
    /// the return slot).
    pub offset: u32,
    /// Erased canonical type tag. Build.rs maps each to the matching
    /// Rust type for the binding signature.
    pub ty: EmittedFieldType,
}
2463
/// Erased canonical type tag the build.rs binding generator uses to
/// pick the Rust type for each `#main` parameter / return slot.
///
/// The variants cover `Int` / `Float` / `Bool` / `String` / `List<Int>`
/// plus internal unit slots. Types without a variant here (presumably
/// other list element types, nested schemas, and closure-valued
/// returns — confirm against [`lower_field_descriptors`]) surface as
/// `UnsupportedSignature` at emit-object time so the binding never
/// sees a type tag it can't handle.
///
/// NOTE(review): an earlier revision of this comment listed Float and
/// Lists as unsupported; the `Float` and `ListInt` variants below
/// supersede that statement.
///
/// ## Three-crate triple contract
///
/// This tag is the byte-for-byte-identical seam shared by three crates;
/// the enum is mirrored (not shared) so the runtime shim and build
/// generator don't take a dep on this codegen crate:
///
/// 1. `relon_codegen_llvm` (this enum) — produced by
///    [`lower_field_descriptors`].
/// 2. `relon_rs_shims::EmittedFieldType` — the runtime mirror;
///    `call_buffer_entry` packs/unpacks per variant.
/// 3. `relon_rs_build` — `rust_type_for` maps each variant to the Rust
///    surface type + `ArgValue` / `RetValue` constructor.
///
/// **Adding a variant is a four-touch change**: (1) add the variant
/// here + its arm in [`lower_field_descriptors`]; (2) add the mirror
/// variant + the `*_in` / `*_out` sibling helpers in
/// `relon_rs_shims::marshal`; (3) add the `rust_type_for` table row in
/// `relon_rs_build`; (4) extend the cross-crate round-trip guard test.
/// The guard test in `relon-rs-build/tests/marshal_roundtrip.rs` fails
/// closed if any of the three crates drift.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EmittedFieldType {
    /// `i64`. Inline slot at offset, 8/8.
    Int,
    /// `f64`. Inline slot at offset, 8/8 (8 LE bytes, IEEE-754).
    Float,
    /// `bool`. Inline slot at offset, 1/1.
    Bool,
    /// `()`. Inline slot at offset, 1/1 (always reads as zero).
    Unit,
    /// `&str` / `String`. Pointer-indirect: fixed slot is a 4-byte
    /// buffer-relative offset to a `[len: u32 LE][utf8 bytes]` tail
    /// record. Build.rs uses `BufferBuilder::write_string` to pack
    /// inputs and `BufferReader::read_string` to decode outputs.
    String,
    /// `&[i64]` / `Vec<i64>`. Pointer-indirect (like `String`): the
    /// fixed slot is a 4-byte buffer-relative offset to a
    /// `[len: u32 LE][pad to 8][i64 LE …]` tail record (8/8-inline
    /// elements, byte-identical to the ConstPool `add_list_int` blob).
    /// Build.rs uses `BufferBuilder::write_list_int` to pack inputs and
    /// `BufferReader::read_list_int` to decode outputs.
    ListInt,
}
2515
/// Metadata returned by [`LlvmAotEvaluator::emit_object`] so the
/// build.rs caller can stamp matching `extern "C"` declarations and
/// marshalling code into the generated Rust shim.
///
/// The shape carried by [`Self::shape`] decides the binding shape:
/// fast-path entries get a thin `extern "C" fn(i64, ...) -> i64`
/// wrapper; buffer-protocol entries route through
/// `relon-rs-shims::call_buffer_entry` with typed Rust args.
///
/// Under [`EmittedEntryShape::FastInt`] the buffer-protocol fields
/// (`main_fields`, `return_fields`, `main_root_size`,
/// `return_root_size`, `const_data`) are documented below as empty or
/// zero.
#[derive(Debug, Clone)]
pub struct EmitObjectInfo {
    /// Exported C ABI symbol name (chosen by the caller; the emitter
    /// renames the JIT-side default to this).
    pub entry_symbol: String,
    /// Number of declared `#main` parameters. For fast-path entries
    /// this equals the C ABI arity; for buffer-protocol entries the C
    /// ABI arity is always 6, while this field reports the
    /// user-visible `#main` arity.
    pub entry_arity: usize,
    /// Declared parameter names in `#main(...)` declaration order.
    /// Build.rs uses these to name the Rust shim's args.
    pub param_names: Vec<String>,
    /// Which extern signature the emitted symbol carries. Drives the
    /// binding generator's dispatch shape.
    pub shape: EmittedEntryShape,
    /// Declared `#main` parameters with byte-offsets and type tags.
    /// Used by the buffer-protocol binding to pack input args into
    /// the arena. Empty under [`EmittedEntryShape::FastInt`] (the
    /// fast path reads args from positional registers, not the
    /// buffer).
    pub main_fields: Vec<EmittedField>,
    /// Return record fields. Phase 2 lowering always wraps the
    /// `#main` return in a single-field schema `Ret { value: T }`,
    /// so this vector has exactly one entry. Empty under
    /// [`EmittedEntryShape::FastInt`].
    pub return_fields: Vec<EmittedField>,
    /// Fixed-area byte size of the input record. The buffer-protocol
    /// binding allocates `in_len = main_root_size + tail_len_for_strings`
    /// bytes. Zero under [`EmittedEntryShape::FastInt`].
    pub main_root_size: u32,
    /// Fixed-area byte size of the return record. The buffer-protocol
    /// binding reserves at least this much in the output region.
    /// Zero under [`EmittedEntryShape::FastInt`].
    pub return_root_size: u32,
    /// Whether the return schema contains pointer-indirect leaves
    /// (`String` / `List*`) — drives the binding's tail-cap sizing.
    /// NOTE(review): `return_needs_tail_region` in this file also
    /// counts `Schema` / `Option` / `Result` / `Enum` fields; confirm
    /// which predicate populates this flag.
    pub return_has_tail: bool,
    /// Const-pool blob the JIT body references through arena-relative
    /// i32 offsets (`Op::ConstString` records). The binding copies
    /// this verbatim to `arena[..const_data.len()]` before every
    /// dispatch. Empty under [`EmittedEntryShape::FastInt`] (the fast
    /// path doesn't touch the const pool).
    pub const_data: Vec<u8>,
    /// `true` when the emitted body references a host shim that lives
    /// in the `relon-rs-shims` staticlib (`relon_llvm_str_contains_arena`
    /// or Wave B's `relon_llvm_f64_to_str`). Build.rs uses this to
    /// decide whether to add that staticlib to the linker invocation.
    /// The historical name predates the second shim; semantically it
    /// means "needs the rs-shims staticlib".
    pub references_str_contains_shim: bool,
}
2576
2577impl LlvmAotEvaluator {
2578    /// AOT entry: compile `src` into a relocatable ELF object file
2579    /// suitable for linker consumption (build.rs path).
2580    ///
2581    /// Phase 2 envelope:
2582    ///
2583    /// - When the source qualifies for the dispatch-boundary fast
2584    ///   path (Int-only `#main(Int...) -> Int`, arity <= 8, no
2585    ///   pointer-indirect leaves, no stdlib call overhead), the
2586    ///   emitted symbol carries the typed
2587    ///   `extern "C" fn(i64, ...) -> i64` shape — the Phase 1 trivial
2588    ///   path. No `SandboxState`, no const-pool, no shim
2589    ///   dependency.
2590    /// - Otherwise the symbol carries the full buffer-protocol entry
2591    ///   shape `extern "C" fn(*const ArenaState, i32, i32, i32, i32,
2592    ///   i64) -> i32`. The build.rs binding generator routes typed
2593    ///   Rust args through `relon-rs-shims::call_buffer_entry` to
2594    ///   marshal them into / out of the arena.
2595    ///
2596    /// In both modes the emitter returns an [`EmitObjectInfo`] that
2597    /// carries the metadata the binding generator needs (entry shape,
2598    /// schema field offsets, const-pool blob, shim reference flag).
2599    ///
2600    /// Returns [`LlvmError::UnsupportedSignature`] when the declared
2601    /// `#main` signature mixes types Phase 2 hasn't wired marshalling
2602    /// for yet (`Float`, `List*`, nested schemas as args, closure
2603    /// returns) — Phase 3 widens the surface.
2604    pub fn emit_object(
2605        src: &str,
2606        entry_symbol: &str,
2607        out_path: &Path,
2608    ) -> Result<EmitObjectInfo, LlvmError> {
2609        // Thin wrapper preserving the historical 3-arg signature the
2610        // rs-build `emit_all` calls (Stage 2 keeps this call site
2611        // stable). Default options (no host `#native` declarations) +
2612        // open-world dispatch — byte-identical to the pre-S2.⑤ path.
2613        let options = relon_analyzer::AnalyzeOptions {
2614            strict_mode: false,
2615            ..Default::default()
2616        };
2617        Self::emit_object_with_options(
2618            src,
2619            entry_symbol,
2620            out_path,
2621            &options,
2622            WorldMode::OpenWorld,
2623            None,
2624        )
2625    }
2626
2627    /// Stage 2.⑤ options-carrying object-emit seam.
2628    ///
2629    /// Threads a caller-supplied [`relon_analyzer::AnalyzeOptions`] (so
2630    /// host `#native` declarations resolve — the W1-C capability-gate
2631    /// e2e enabler) and a [`WorldMode`] through the object-emit path.
2632    ///
2633    /// - [`WorldMode::OpenWorld`] (the [`Self::emit_object`] default):
2634    ///   `Op::CallNative` lowers to the dynamic `relon_llvm_call_native`
2635    ///   helper. `host_shim_src` is ignored.
2636    /// - [`WorldMode::ClosedWorld`]: `Op::CallNative` lowers to a direct
2637    ///   `call @<host_symbol>`; `host_shim_src` (the `#[no_mangle]
2638    ///   extern "C"` host crate) is compiled to LLVM-18 bitcode, linked
2639    ///   into the emitted module, force-inlined, and folded by O3 — so
2640    ///   every native call collapses to the host fn body in the `.o`.
2641    ///   A `None` shim on the closed-world path is an error when the
2642    ///   source actually imports a host fn.
2643    pub fn emit_object_with_options(
2644        src: &str,
2645        entry_symbol: &str,
2646        out_path: &Path,
2647        options: &relon_analyzer::AnalyzeOptions,
2648        world_mode: WorldMode,
2649        host_shim_src: Option<&str>,
2650    ) -> Result<EmitObjectInfo, LlvmError> {
2651        // Default target is the host (native x86-64 ELF). S3.X adds the
2652        // wasm32 retarget via `emit_object_for_target`.
2653        Self::emit_object_for_target(
2654            src,
2655            entry_symbol,
2656            out_path,
2657            options,
2658            world_mode,
2659            host_shim_src,
2660            CodegenTarget::Native,
2661        )
2662    }
2663
2664    /// S3.X object-emit seam parameterised by [`CodegenTarget`].
2665    ///
2666    /// `CodegenTarget::Native` is byte-identical to the historical
2667    /// [`Self::emit_object_with_options`] path. `CodegenTarget::Wasm32`
2668    /// runs the SAME relon-IR → LLVM-IR emitter but constructs a
2669    /// `wasm32-wasi` `TargetMachine` (+ stamps the module's wasm32
2670    /// triple / DataLayout) so `write_to_file` emits a `\0asm` object
2671    /// instead of an ELF `.o`. The lowered body is unchanged — `mem.rs`
2672    /// already lays the arena out via pointer-width-agnostic i32-offset
2673    /// GEPs.
2674    ///
2675    /// Wasm32 supports both worlds (P3 §2.2). Open-world routes every
2676    /// `#native` host fn through a WASI import. Closed-world co-compiles
2677    /// the **pure-compute** host fns into the wasm unit and inlines them
2678    /// (via `link_and_inline_host_shim_wasm_pure_only`), while still
2679    /// routing **effectful** (capability-gated) host fns through WASI
2680    /// imports — symmetric with the native closed-world inline.
2681    #[allow(clippy::too_many_arguments)]
2682    pub fn emit_object_for_target(
2683        src: &str,
2684        entry_symbol: &str,
2685        out_path: &Path,
2686        options: &relon_analyzer::AnalyzeOptions,
2687        world_mode: WorldMode,
2688        host_shim_src: Option<&str>,
2689        target: CodegenTarget,
2690    ) -> Result<EmitObjectInfo, LlvmError> {
2691        let (ir, main_schema, return_schema) = Self::lower_source_with_options(src, Some(options))?;
2692        let main_layout = relon_eval_api::layout::SchemaLayout::offsets_for(&main_schema)
2693            .map_err(|e| LlvmError::Codegen(format!("main schema layout: {e}")))?;
2694        let return_layout = relon_eval_api::layout::SchemaLayout::offsets_for(&return_schema)
2695            .map_err(|e| LlvmError::Codegen(format!("return schema layout: {e}")))?;
2696        let param_names: Vec<String> = main_schema.fields.iter().map(|f| f.name.clone()).collect();
2697        let schema = BufferSchema {
2698            main_schema,
2699            return_schema,
2700            main_layout,
2701            return_layout,
2702        };
2703
2704        // Materialise the per-field metadata up-front so we can hand
2705        // it back regardless of whether we end up on the fast or
2706        // buffer-protocol path. Surfaces an `UnsupportedSignature`
2707        // for type tags Phase 2 hasn't wired marshalling for yet —
2708        // the build.rs binding side can't generate a Rust wrapper
2709        // for an unknown leaf type.
2710        //
2711        // This strict projection only matters to the **build.rs binding
2712        // generator**, which consumes `main_fields` / `return_fields` to
2713        // stamp the typed Rust wrapper — that path is `Native` only. The
2714        // `Wasm32` target feeds the **wasm-evaluator host**, which packs
2715        // its input and decodes its return through `wasm_buffer_plan` /
2716        // `wasm_buffer_decode` (driven by the full `BufferSchema`), never
2717        // these erased descriptors. So a `#main` carrying a pointer-array
2718        // list param/return the binding can't marshal (e.g. an in-place
2719        // `List<List<scalar>>` / `List<String>` / `List<Schema>` identity)
2720        // must still emit a runnable wasm body. We therefore only enforce
2721        // the binding-marshallability gate on `Native`; on `Wasm32` an
2722        // unbindable leaf yields an empty descriptor vec (the wasm host
2723        // ignores it) rather than aborting the emit.
2724        let descriptors_strict = matches!(target, CodegenTarget::Native);
2725        let (main_fields, return_fields) = if descriptors_strict {
2726            (
2727                lower_field_descriptors(&schema.main_schema, &schema.main_layout)?,
2728                lower_field_descriptors(&schema.return_schema, &schema.return_layout)?,
2729            )
2730        } else {
2731            (
2732                lower_field_descriptors(&schema.main_schema, &schema.main_layout)
2733                    .unwrap_or_default(),
2734                lower_field_descriptors(&schema.return_schema, &schema.return_layout)
2735                    .unwrap_or_default(),
2736            )
2737        };
2738
2739        let entry_idx = ir
2740            .entry_func_index
2741            .ok_or_else(|| LlvmError::Codegen("IR module has no entry function".into()))?;
2742        let entry = &ir.funcs[entry_idx];
2743
2744        // Verify the IR carries the canonical buffer-protocol entry
2745        // signature. `lower_workspace_single` always produces this
2746        // shape today; failing the check means an IR-layer change
2747        // slipped past the test gates.
2748        if !crate::codegen::is_buffer_protocol_signature(&entry.params, entry.ret) {
2749            return Err(LlvmError::UnsupportedSignature(
2750                "relon-rs build: lowering produced a non-buffer entry shape".into(),
2751            ));
2752        }
2753
2754        // Fast-path eligibility — Int-only schema, arity <= 8, no
2755        // pointer-indirect leaves. Sources that don't qualify drop to
2756        // the buffer-protocol path below.
2757        //
2758        // Stage 2.⑤: the closed-world path always takes the buffer
2759        // entry — `Op::CallNative` needs the `*state` pointer only the
2760        // buffer entry threads (the fast entry has no state slot). An
2761        // Int-only `#main` that calls a host fn would otherwise match
2762        // the fast profile and emit an entry the native-dispatch
2763        // lowering rejects. Force buffer mode for closed-world.
2764        let fast_profile = match world_mode {
2765            WorldMode::ClosedWorld => None,
2766            // P3 §2.2: a module that calls a `#native` host fn must take
2767            // the buffer entry even when its `#main` schema is Int-only
2768            // and would otherwise match the fast profile — `Op::CallNative`
2769            // / the preceding `Op::CheckCap` need the `*state` pointer and
2770            // the trailing `caps` slot only the buffer entry threads (the
2771            // fast `(i64..)->i64` entry has neither). Same reasoning the
2772            // closed-world arm uses to force buffer mode.
2773            WorldMode::OpenWorld if !ir.imports.is_empty() => None,
2774            WorldMode::OpenWorld => build_fast_path_profile(&schema).ok(),
2775        };
2776
2777        let ctx = Context::create();
2778        let module = ctx.create_module("relon_rs_object");
2779
2780        // Phase E.1 const-pool blob; needed by buffer-protocol bodies
2781        // for `Op::ConstString { idx }` resolution. The fast path
2782        // doesn't reference the pool (Int-only bodies have no
2783        // ConstString ops) so the blob ends up empty in that branch.
2784        let const_pool = ConstPool::from_module(&ir)?;
2785
2786        // Phase D fast-entry eligibility is decided from the `#main`
2787        // schema alone (Int args, single-Int return). That envelope is
2788        // necessary but not sufficient: a fast-qualifying schema can
2789        // still wrap a body that touches ops the `(i64..) -> i64` fast
2790        // entry can't lower — most notably `Op::ConstString` /
2791        // `Op::ConstList*`, which resolve against the arena-prefix
2792        // const-pool the fast entry has no state pointer to reach (it
2793        // emits with an empty pool). W4
2794        // (`range(n).map(=>"axb").filter(s.contains("x")).len()`) is the
2795        // canonical case: an `Int -> Int` schema over a string-literal
2796        // body. The in-process MCJIT path (`from_ir_inner_world`) emits
2797        // the buffer entry first and treats a failed fast-entry emit as
2798        // a soft "no fast path", rolling the fast entry back and keeping
2799        // the buffer entry. The object-emit path historically emitted
2800        // *only* the fast entry, so the same body hard-failed here with
2801        // a `missing const-pool entry`. Mirror MCJIT: try the fast entry
2802        // first, and on emit failure fall through to the buffer entry
2803        // (which lowers `Op::ConstString` against the real const-pool).
2804        let fast_profile = match fast_profile {
2805            // W7 recursive-closure Dict: a module that declares lambdas
2806            // (`#internal fib: (k) => ... fib(...)`) can match the fast
2807            // `(i64..) -> i64` envelope (Int `#main`, single-Int `result`
2808            // field) yet its body emits `Op::MakeClosure` /
2809            // `Op::CallClosure`, which resolve a lambda FunctionValue from
2810            // the module-wide `closure_fn_table`. The fast-only object-emit
2811            // branch emits *only* the fast entry with empty helper / closure
2812            // tables (it never declares + emits the lambda bodies), so
2813            // `MakeClosure fn_table_idx=N` hits an empty table. The buffer
2814            // path routes through `emit_module_funcs`, which declares every
2815            // lambda up-front (forward reference for `fib`'s self-call) and
2816            // emits each lambda body — the only place closures lower
2817            // correctly for static object emit. Force the buffer entry
2818            // whenever the module declares any lambda. The in-process MCJIT
2819            // path (`from_ir_inner_world`) already gets this for free: it
2820            // emits the buffer module first (lambdas declared + emitted) and
2821            // only *adds* a fast entry on top, reusing the populated table.
2822            Some(profile) if fast_entry_emittable(entry) && ir.closure_table.is_empty() => {
2823                Some(profile)
2824            }
2825            _ => None,
2826        };
2827
2828        let (shape, references_str_contains_shim) = match fast_profile {
2829            Some(ref profile) => {
2830                // Fast-path entry only. Same shape the Phase 1 trivial
2831                // demo path emitted — pure i64 in / i64 out, no
2832                // SandboxState pointer, no const-pool copy.
2833                //
2834                // Phase D.2: the W7 anon-Dict-return shape needs the
2835                // module-wide helper / closure tables so the fast entry
2836                // can resolve in-body `Op::Call` / `Op::CallClosure`
2837                // sites. Empty tables are fine for Phase D.1's pure
2838                // Int-arithmetic bodies (W1) — the emitter just never
2839                // looks them up.
2840                let helper_table: HashMap<u32, FunctionValue<'_>> = HashMap::new();
2841                let closure_fn_table: Vec<FunctionValue<'_>> = Vec::new();
2842                let llvm_fn = emit_fast_entry(
2843                    &ctx,
2844                    &module,
2845                    entry,
2846                    profile,
2847                    &helper_table,
2848                    &closure_fn_table,
2849                )?;
2850                llvm_fn.as_global_value().set_name(entry_symbol);
2851                llvm_fn.set_linkage(Linkage::External);
2852                (EmittedEntryShape::FastInt, false)
2853            }
2854            None => {
2855                // Buffer-protocol entry. Routes through
2856                // `emit_module_funcs` so user-defined helper functions
2857                // and bundled-stdlib bodies (Phase 2 P1 surface) lower
2858                // alongside the entry.
2859                let buffer_return_size = schema.return_layout.root_size as u32;
2860                let lambda_ir_idx_set: std::collections::HashSet<u32> =
2861                    ir.closure_table.iter().copied().collect();
2862                let helpers: Vec<&relon_ir::ir::Func> = ir
2863                    .funcs
2864                    .iter()
2865                    .enumerate()
2866                    .filter(|(i, _)| *i != entry_idx && !lambda_ir_idx_set.contains(&(*i as u32)))
2867                    .map(|(_, f)| f)
2868                    .collect();
2869                let helper_ir_indices: Vec<u32> = ir
2870                    .funcs
2871                    .iter()
2872                    .enumerate()
2873                    .filter(|(i, _)| *i != entry_idx && !lambda_ir_idx_set.contains(&(*i as u32)))
2874                    .map(|(i, _)| i as u32)
2875                    .collect();
2876                let lambdas: Vec<&relon_ir::ir::Func> = ir
2877                    .closure_table
2878                    .iter()
2879                    .map(|&ir_idx| &ir.funcs[ir_idx as usize])
2880                    .collect();
2881                // Stage 2.⑤ / P3 §2.2: pick the dispatch emitter by world
2882                // mode + target. Native open-world (default / rs-build
2883                // today) keeps the dynamic `relon_llvm_call_native` hop;
2884                // native closed-world lowers `Op::CallNative` to a direct
2885                // `call @<host>` that the host-bitcode link + inline below
2886                // folds away. wasm32 open-world lowers `Op::CallNative` to a
2887                // **wasm import** call (`crate::wasi_host`). wasm32
2888                // closed-world (P3 §2.2 co-compile) inlines the
2889                // **pure-compute** host fns into the wasm unit while routing
2890                // **effectful** ones (capability-gated) through wasm imports
2891                // — `effectful_imports` carries the per-import split derived
2892                // from the IR's CheckCap shape.
2893                let effectful_imports = compute_effectful_imports(&ir);
2894                let llvm_fn = match (world_mode, target) {
2895                    (WorldMode::ClosedWorld, CodegenTarget::Wasm32) => {
2896                        emit_module_funcs_closed_world_wasm(
2897                            &ctx,
2898                            &module,
2899                            entry,
2900                            buffer_return_size,
2901                            &const_pool,
2902                            &helpers,
2903                            Some(&helper_ir_indices),
2904                            &lambdas,
2905                            &ir.closure_table,
2906                            &ir.imports,
2907                            &effectful_imports,
2908                        )?
2909                        .0
2910                    }
2911                    (world_mode, target) => {
2912                        let emit = match (world_mode, target) {
2913                            (WorldMode::OpenWorld, CodegenTarget::Wasm32) => emit_module_funcs_wasm,
2914                            (WorldMode::OpenWorld, CodegenTarget::Native) => emit_module_funcs,
2915                            (WorldMode::ClosedWorld, _) => emit_module_funcs_closed_world,
2916                        };
2917                        emit(
2918                            &ctx,
2919                            &module,
2920                            entry,
2921                            buffer_return_size,
2922                            &const_pool,
2923                            &helpers,
2924                            Some(&helper_ir_indices),
2925                            &lambdas,
2926                            &ir.closure_table,
2927                            &ir.imports,
2928                        )?
2929                        .0
2930                    }
2931                };
2932                // Rename the canonical buffer entry to the build.rs-
2933                // supplied symbol and force external linkage so the
2934                // consuming binary's linker can resolve it.
2935                llvm_fn.as_global_value().set_name(entry_symbol);
2936                llvm_fn.set_linkage(Linkage::External);
2937
2938                // Closed-world: link the host shim bitcode into THIS
2939                // module + force-inline every imported host fn so the
2940                // direct `call @<host>` sites collapse to the host body.
2941                // Reuses the `crate::cocompile` link/inline orchestration.
2942                // Native links the host shim built for the host triple;
2943                // wasm32 links the host shim built for
2944                // `wasm32-unknown-unknown` so the inlined body matches the
2945                // wasm unit's pointer width. Either way only the
2946                // pre-declared (pure) host fns carry a direct `call @<host>`
2947                // to fold — effectful imports stay as wasm imports.
2948                if matches!(world_mode, WorldMode::ClosedWorld) {
2949                    let shim = host_shim_src.ok_or_else(|| {
2950                        LlvmError::Codegen(
2951                            "emit_object_with_options: ClosedWorld requires a host_shim_src \
2952                             (the #[no_mangle] extern \"C\" host crate to link + inline)"
2953                                .into(),
2954                        )
2955                    })?;
2956                    match target {
2957                        CodegenTarget::Wasm32 => {
2958                            crate::cocompile::link_and_inline_host_shim_wasm_pure_only(
2959                                &module,
2960                                shim,
2961                                &ir.imports,
2962                                &effectful_imports,
2963                            )?;
2964                        }
2965                        CodegenTarget::Native => {
2966                            crate::cocompile::link_and_inline_host_shim(
2967                                &module,
2968                                shim,
2969                                &ir.imports,
2970                            )?;
2971                        }
2972                    }
2973                }
2974
2975                // Detect whether the emitted module references any
2976                // host shim that lives in the `relon-rs-shims`
2977                // staticlib (`relon_llvm_str_contains_arena`, Wave B's
2978                // `relon_llvm_f64_to_str`) — drives build.rs's decision
2979                // to add that staticlib to the linker invocation. We
2980                // check by name lookup against the LLVM module since
2981                // the emit pass declares each extern lazily on its
2982                // first call site.
2983                let needs_shim = module
2984                    .get_function(RELON_LLVM_STR_CONTAINS_ARENA_SYMBOL)
2985                    .is_some()
2986                    || module
2987                        .get_function(crate::str_helpers::RELON_LLVM_F64_TO_STR_SYMBOL)
2988                        .is_some();
2989                (EmittedEntryShape::Buffer, needs_shim)
2990            }
2991        };
2992
2993        module.verify().map_err(|e| {
2994            LlvmError::Codegen(format!("LLVM verifier rejected object module: {e}"))
2995        })?;
2996
2997        // Construct the object-emit `TargetMachine` for the requested
2998        // target up front so the same machine drives both the O3
2999        // pipeline and the backend codegen below.
3000        let (machine, target_triple) = create_object_target_machine(target)?;
3001
3002        // Stamp the module's triple + DataLayout so the lowered pointer
3003        // width / endianness match the machine. Native inherits the
3004        // host triple LLVM already uses; wasm32 needs the explicit
3005        // `wasm32-wasi` triple + 32-bit DataLayout or the
3006        // verifier/codegen would default to the host's 64-bit layout.
3007        // Pulling the DataLayout straight from the machine's target data
3008        // keeps it authoritative for whichever target we built.
3009        module.set_triple(&TargetTriple::create(&target_triple));
3010        module.set_data_layout(&machine.get_target_data().get_data_layout());
3011
3012        match target {
3013            CodegenTarget::Native => {
3014                // Stamp the host CPU onto every function so the
3015                // per-function subtarget matches the host `TargetMachine`.
3016                // Keeps the AOT and MCJIT paths consistent.
3017                stamp_host_target_attributes(&module);
3018                // Host-targeted O3 (same pipeline the JIT path uses).
3019                run_default_o3_pipeline(&module)?;
3020            }
3021            CodegenTarget::Wasm32 => {
3022                // No host-CPU stamping (x86 features are meaningless for
3023                // wasm and would mis-narrow lowering). Run O3 against the
3024                // wasm32 machine so the middle-end optimises for the wasm
3025                // target's DataLayout.
3026                let opts = PassBuilderOptions::create();
3027                module
3028                    .run_passes("default<O3>", &machine, opts)
3029                    .map_err(|e| LlvmError::Codegen(format!("wasm32 run_passes O3: {e}")))?;
3030            }
3031        }
3032
3033        if let Some(parent) = out_path.parent() {
3034            if !parent.as_os_str().is_empty() {
3035                std::fs::create_dir_all(parent)
3036                    .map_err(|e| LlvmError::Codegen(format!("create out dir `{parent:?}`: {e}")))?;
3037            }
3038        }
3039        machine
3040            .write_to_file(&module, FileType::Object, out_path)
3041            .map_err(|e| LlvmError::Codegen(format!("write object `{out_path:?}`: {e}")))?;
3042
3043        // For the fast path the binding's arity matches the LLVM
3044        // entry signature's i64-slot count. For the buffer path
3045        // there's no per-Rust-arg correspondence with the LLVM
3046        // signature (which is always 6 slots), so we report the
3047        // user-visible `#main` arity instead.
3048        let entry_arity = main_fields.len();
3049        let main_root_size = schema.main_layout.root_size as u32;
3050        let return_root_size = schema.return_layout.root_size as u32;
3051        let return_has_tail = return_needs_tail_region(&schema.return_schema);
3052        let const_data = match shape {
3053            EmittedEntryShape::FastInt => Vec::new(),
3054            EmittedEntryShape::Buffer => const_pool.bytes,
3055        };
3056        let (main_fields_out, return_fields_out, main_root_size_out, return_root_size_out) =
3057            match shape {
3058                EmittedEntryShape::FastInt => (Vec::new(), Vec::new(), 0, 0),
3059                EmittedEntryShape::Buffer => {
3060                    (main_fields, return_fields, main_root_size, return_root_size)
3061                }
3062            };
3063
3064        Ok(EmitObjectInfo {
3065            entry_symbol: entry_symbol.to_string(),
3066            entry_arity,
3067            param_names,
3068            shape,
3069            main_fields: main_fields_out,
3070            return_fields: return_fields_out,
3071            main_root_size: main_root_size_out,
3072            return_root_size: return_root_size_out,
3073            return_has_tail: matches!(shape, EmittedEntryShape::Buffer) && return_has_tail,
3074            const_data,
3075            references_str_contains_shim,
3076        })
3077    }
3078}
3079
3080/// Walk a `(Schema, OffsetTable)` pair and project the per-field
3081/// declaration into the build.rs-visible [`EmittedField`] shape. The
3082/// type tag is erased into [`EmittedFieldType`] for the Phase 2
3083/// supported leaf set; any unsupported leaf surfaces as
3084/// [`LlvmError::UnsupportedSignature`] so build.rs never generates a
3085/// binding it can't compile.
3086fn lower_field_descriptors(
3087    schema: &relon_eval_api::schema_canonical::Schema,
3088    layout: &relon_eval_api::layout::OffsetTable,
3089) -> Result<Vec<EmittedField>, LlvmError> {
3090    let mut out = Vec::with_capacity(schema.fields.len());
3091    for (i, f) in schema.fields.iter().enumerate() {
3092        let lo = layout.fields.get(i).ok_or_else(|| {
3093            LlvmError::Codegen(format!(
3094                "lower_field_descriptors: layout missing slot for field `{}`",
3095                f.name
3096            ))
3097        })?;
3098        if lo.name != f.name {
3099            return Err(LlvmError::Codegen(format!(
3100                "lower_field_descriptors: schema/layout name mismatch at slot {i}: schema=`{}`, layout=`{}`",
3101                f.name, lo.name
3102            )));
3103        }
3104        let ty = emitted_field_type_for(&f.ty).ok_or_else(|| {
3105            LlvmError::UnsupportedSignature(format!(
3106                "relon-rs build (Phase 2): field `{}` type {:?} not yet wired for marshalling",
3107                f.name, f.ty
3108            ))
3109        })?;
3110        out.push(EmittedField {
3111            name: f.name.clone(),
3112            offset: lo.offset as u32,
3113            ty,
3114        });
3115    }
3116    Ok(out)
3117}
3118
3119/// Project one canonical [`TypeRepr`] onto the build.rs-visible
3120/// [`EmittedFieldType`] tag, or `None` when the leaf isn't yet wired for
3121/// AOT-binding marshalling.
3122///
3123/// This is the per-variant accept-set table for the
3124/// [`EmittedFieldType`] triple's codegen end. To widen the AOT signature
3125/// surface (e.g. Float / List lanes), add the matching arm here — the
3126/// `None` fall-through keeps every still-unsupported leaf surfacing as
3127/// `UnsupportedSignature` rather than silently emitting a tag the shim
3128/// can't decode.
3129fn emitted_field_type_for(
3130    ty: &relon_eval_api::schema_canonical::TypeRepr,
3131) -> Option<EmittedFieldType> {
3132    use relon_eval_api::schema_canonical::TypeRepr;
3133    match ty {
3134        TypeRepr::Int => Some(EmittedFieldType::Int),
3135        TypeRepr::Float => Some(EmittedFieldType::Float),
3136        TypeRepr::Bool => Some(EmittedFieldType::Bool),
3137        TypeRepr::Unit => Some(EmittedFieldType::Unit),
3138        TypeRepr::String => Some(EmittedFieldType::String),
3139        TypeRepr::List { element } if matches!(element.as_ref(), TypeRepr::Int) => {
3140            Some(EmittedFieldType::ListInt)
3141        }
3142        // ----- add new AOT-marshallable leaf type above this line -----
3143        _ => None,
3144    }
3145}
3146
3147/// Stamp the runtime host CPU/feature set onto every function in the
3148/// module as `"target-cpu"` / `"target-features"` string function
3149/// attributes.
3150///
3151/// ## Why this exists (correctness, not a micro-opt)
3152///
3153/// The MCJIT execution engine is created without an MCPU/MAttr —
3154/// `MCJITCompilerOptions` exposes no CPU field, and inkwell's
3155/// `create_*_execution_engine*` builders take only an
3156/// [`OptimizationLevel`] (+ a `CodeModel` on the memory-manager
3157/// variant). With no CPU pinned, the X86 backend lowers for **generic
3158/// x86-64** and drops every host-tuning decision the per-CPU
3159/// `SubtargetFeatures` would have enabled. The one that bites hardest:
3160/// the `SlowDivide64` tuning that narrows a 64-bit `idivq` whose
3161/// operands provably fit in 32 bits into the host `shrq $32; je; divl`
3162/// fast path. Generic codegen always emits the bare microcoded
3163/// `idivq`, so every i64 `%` / `/` runs the slow divider at runtime.
3164///
3165/// The `default<O3>` middle-end pipeline already runs against a host
3166/// `TargetMachine` (see [`run_default_o3_pipeline`]) and the static
3167/// object-emit path bakes the host CPU into its `TargetMachine` too,
3168/// so both of those already lower for the host. Only the **MCJIT
3169/// backend codegen** was generic. LLVM resolves a function's subtarget
3170/// from its `"target-cpu"` / `"target-features"` string attributes
3171/// when present, so stamping the host values here makes the MCJIT
3172/// backend lower each function for the CPU it will actually run on —
3173/// identical results, correct host instruction selection.
3174///
3175/// The CPU/features are queried from the running host
3176/// ([`TargetMachine::get_host_cpu_name`] /
3177/// [`TargetMachine::get_host_cpu_features`]) — the SAME source the O3
3178/// pipeline uses — so this is correct on any machine and never pins a
3179/// hard-coded microarchitecture.
3180fn stamp_host_target_attributes(module: &inkwell::module::Module<'_>) {
3181    // `get_host_cpu_*` reads the running CPU via LLVM's host
3182    // introspection; no native-target init is required for these two
3183    // queries, but every caller has already initialised the native
3184    // target by this point (verify -> O3 -> engine).
3185    let cpu = TargetMachine::get_host_cpu_name();
3186    let features = TargetMachine::get_host_cpu_features();
3187    let cpu = cpu.to_str().unwrap_or("");
3188    let features = features.to_str().unwrap_or("");
3189    if cpu.is_empty() {
3190        // Host introspection failed; leave the module generic rather
3191        // than stamping an empty/bogus CPU. The engine still works,
3192        // just without host narrowing (the pre-fix behaviour).
3193        return;
3194    }
3195    let ctx = module.get_context();
3196    let cpu_attr = ctx.create_string_attribute("target-cpu", cpu);
3197    let features_attr = ctx.create_string_attribute("target-features", features);
3198    let mut func = module.get_first_function();
3199    while let Some(f) = func {
3200        // Only stamp functions with a body. Pure declarations (the
3201        // `relon_llvm_str_contains_arena` host shim, intrinsics) have
3202        // no IR to lower, and stamping a target-cpu on an external
3203        // declaration is harmless but pointless.
3204        if f.count_basic_blocks() > 0 {
3205            // Idempotent: replace any pre-existing stamp so a re-run
3206            // (or an emitter that already set one) lands on the host.
3207            f.remove_string_attribute(inkwell::attributes::AttributeLoc::Function, "target-cpu");
3208            f.remove_string_attribute(
3209                inkwell::attributes::AttributeLoc::Function,
3210                "target-features",
3211            );
3212            f.add_attribute(inkwell::attributes::AttributeLoc::Function, cpu_attr);
3213            f.add_attribute(inkwell::attributes::AttributeLoc::Function, features_attr);
3214        }
3215        func = f.get_next_function();
3216    }
3217}
3218
3219fn run_default_o3_pipeline(module: &inkwell::module::Module<'_>) -> Result<(), LlvmError> {
3220    Target::initialize_native(&InitializationConfig::default())
3221        .map_err(|e| LlvmError::Codegen(format!("initialize_native: {e}")))?;
3222    let triple_str = TargetMachine::get_default_triple();
3223    let target = Target::from_triple(&triple_str)
3224        .map_err(|e| LlvmError::Codegen(format!("target from_triple: {e}")))?;
3225    let cpu = TargetMachine::get_host_cpu_name();
3226    let features = TargetMachine::get_host_cpu_features();
3227    let triple = TargetTriple::create(
3228        triple_str
3229            .as_str()
3230            .to_str()
3231            .map_err(|e| LlvmError::Codegen(format!("triple utf8: {e}")))?,
3232    );
3233    let machine = target
3234        .create_target_machine(
3235            &triple,
3236            cpu.to_str().unwrap_or(""),
3237            features.to_str().unwrap_or(""),
3238            OptimizationLevel::Aggressive,
3239            RelocMode::Default,
3240            CodeModel::JITDefault,
3241        )
3242        .ok_or_else(|| LlvmError::Codegen("create_target_machine returned null".into()))?;
3243    let opts = PassBuilderOptions::create();
3244    module
3245        .run_passes("default<O3>", &machine, opts)
3246        .map_err(|e| LlvmError::Codegen(format!("run_passes O3: {e}")))?;
3247    Ok(())
3248}
3249
3250/// Build the object-emit `TargetMachine` for the requested
3251/// [`CodegenTarget`]. Native bakes the host CPU/features + PIC reloc;
3252/// Wasm32 initialises the WebAssembly backend and pins the
3253/// `wasm32-wasi` triple. The triple String returned alongside lets the
3254/// caller stamp the module's target-triple (the DataLayout is pulled
3255/// from the machine's target data) so the wasm object's pointer width /
3256/// endianness match the machine.
3257fn create_object_target_machine(
3258    target: CodegenTarget,
3259) -> Result<(TargetMachine, String), LlvmError> {
3260    match target {
3261        CodegenTarget::Native => {
3262            Target::initialize_native(&InitializationConfig::default())
3263                .map_err(|e| LlvmError::Codegen(format!("initialize_native: {e}")))?;
3264            let triple_str = TargetMachine::get_default_triple();
3265            let t = Target::from_triple(&triple_str)
3266                .map_err(|e| LlvmError::Codegen(format!("target from_triple: {e}")))?;
3267            let cpu = TargetMachine::get_host_cpu_name();
3268            let features = TargetMachine::get_host_cpu_features();
3269            let triple = TargetTriple::create(
3270                triple_str
3271                    .as_str()
3272                    .to_str()
3273                    .map_err(|e| LlvmError::Codegen(format!("triple utf8: {e}")))?,
3274            );
3275            let machine = t
3276                .create_target_machine(
3277                    &triple,
3278                    cpu.to_str().unwrap_or(""),
3279                    features.to_str().unwrap_or(""),
3280                    OptimizationLevel::Aggressive,
3281                    RelocMode::PIC,
3282                    CodeModel::Default,
3283                )
3284                .ok_or_else(|| LlvmError::Codegen("create_target_machine returned null".into()))?;
3285            let triple_owned = triple_str
3286                .as_str()
3287                .to_str()
3288                .map_err(|e| LlvmError::Codegen(format!("triple utf8: {e}")))?
3289                .to_string();
3290            Ok((machine, triple_owned))
3291        }
3292        CodegenTarget::Wasm32 => {
3293            // The WebAssembly backend lives behind the `target-webassembly`
3294            // inkwell feature; `initialize_webassembly` registers it.
3295            Target::initialize_webassembly(&InitializationConfig::default());
3296            let triple = TargetTriple::create(WASM32_TRIPLE);
3297            let t = Target::from_triple(&triple)
3298                .map_err(|e| LlvmError::Codegen(format!("wasm32 target from_triple: {e}")))?;
3299            // No host-CPU narrowing for wasm; the MVP+ feature set is
3300            // controlled by the wasm runtime (wasmtime defaults). Reloc
3301            // is irrelevant for the wasm object model — `Static`/`Default`
3302            // both produce a relocatable `\0asm` object.
3303            //
3304            // `+bulk-memory`: lower `llvm.memcpy` / `llvm.memset` to the
3305            // native `memory.copy` / `memory.fill` ops instead of a libc
3306            // `env::memcpy` import. The pointer-indirect String / List
3307            // return-store path (`emit_store_field_pointer_indirect`)
3308            // emits a `memcpy`; without bulk-memory wasm-ld leaves an
3309            // unresolved `env::memcpy` import that no standard WASI host
3310            // satisfies. wasmtime enables bulk-memory by default, so the
3311            // emitted module stays ecosystem-portable.
3312            let machine = t
3313                .create_target_machine(
3314                    &triple,
3315                    /*cpu=*/ "",
3316                    /*features=*/ "+bulk-memory",
3317                    OptimizationLevel::Aggressive,
3318                    RelocMode::Static,
3319                    CodeModel::Default,
3320                )
3321                .ok_or_else(|| {
3322                    LlvmError::Codegen("wasm32 create_target_machine returned null".into())
3323                })?;
3324            Ok((machine, WASM32_TRIPLE.to_string()))
3325        }
3326    }
3327}
relon_codegen_llvm/evaluator.rs

relon_codegen_llvm/
evaluator.rs