relon_codegen_llvm/evaluator.rs
1//! Runtime façade for the LLVM AOT backend.
2//!
3//! Phase B widens the evaluator past the bootstrap envelope:
4//!
5//! - [`LlvmAotEvaluator::from_ir_direct`] keeps the legacy-i64 entry
6//! shape (`(I64...) -> I64`) for hand-built IR fixtures and the
7//! side-by-side `from_ir_direct` benches.
8//! - [`LlvmAotEvaluator::from_source`] drives the full
9//! parse + analyze + `lower_workspace_single` + LLVM emit + JIT
10//! pipeline. Matches the cranelift backend's `from_source` shape
11//! so a host can swap the two evaluators by changing the
12//! constructor name.
13//!
14//! ## Why MCJIT (and not ORC) for Phase B
15//!
16//! - MCJIT is the simplest engine that inkwell exposes — single
17//! `create_jit_execution_engine` call, no per-symbol resolver
18//! plumbing. The Phase B goal is W1 / W2 production-source parity,
19//! not throughput.
20//! - inkwell 0.9.0 wraps both engines, so switching to ORC in
21//! Phase C is a localised diff: one call site here, the emitter
22//! stays untouched.
23//! - LLVM 18's MCJIT still handles the W1 / W2 hot path (single
24//! function, no global state, no external symbols).
25
26use std::cell::RefCell;
27use std::collections::HashMap;
28use std::sync::atomic::{AtomicI64, Ordering};
29use std::sync::Arc;
30
31use inkwell::context::Context;
32use inkwell::execution_engine::ExecutionEngine;
33use inkwell::passes::PassBuilderOptions;
34use inkwell::targets::{
35 CodeModel, InitializationConfig, RelocMode, Target, TargetMachine, TargetTriple,
36};
37use inkwell::OptimizationLevel;
38
39use relon_eval_api::inplace_return::ArenaRegions;
40use relon_eval_api::{ClosureData, Evaluator, RuntimeError, Scope, Thunk, Value};
41use relon_parser::Node;
42
43use crate::codegen::{
44 emit_fast_entry, emit_module_funcs, emit_module_funcs_closed_world,
45 emit_module_funcs_closed_world_wasm, emit_module_funcs_wasm, is_buffer_protocol_signature,
46 ConstPool, EntryShape, FastPathProfile, WorldMode, ENTRY_SYMBOL, ENTRY_SYMBOL_FAST,
47};
48use crate::error::LlvmError;
49use crate::state::ArenaState;
50use crate::str_helpers::RELON_LLVM_STR_CONTAINS_ARENA_SYMBOL;
51use inkwell::module::Linkage;
52use inkwell::targets::FileType;
53use inkwell::values::FunctionValue;
54use std::path::Path;
55
/// Maximum positional arity supported by the Phase A legacy-i64
/// entry. Mirrors the cranelift crate's `MAX_LEGACY_ARITY`; the four
/// slots cover every helloworld-style body in the Phase A bootstrap
/// + benchmarks.
///
/// Phase B adds the buffer-protocol path on top — that path is not
/// arity-capped because every IR arg flows through the buffer rather
/// than positional slots.
///
/// Enforced at construction time: a non-buffer entry that declares
/// more parameters than this is rejected with
/// `LlvmError::UnsupportedSignature`.
pub const MAX_LEGACY_ARITY: usize = 4;
65
/// Codegen target for the object-emit path (S3.X).
///
/// The SAME relon-IR → LLVM-IR emitter feeds both variants — only the
/// `TargetMachine` construction (triple + DataLayout + CPU/features +
/// reloc/code model) differs. `mem.rs` already lays out the arena via
/// i32-offset GEPs (zext-i64 + `i8*` base), so the lowered body is
/// pointer-width agnostic and needs no per-target change.
///
/// The MCJIT constructors (`from_ir_direct` / `from_source*`) do not
/// take a `CodegenTarget`; they always compile for the host.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CodegenTarget {
    /// Host x86-64 ELF object (the historical default). Triple +
    /// CPU/features come from `TargetMachine::get_default_triple` /
    /// `get_host_cpu_*`, reloc = PIC.
    Native,
    /// `wasm32-wasi` object (`\0asm` magic). Uses the WebAssembly LLVM
    /// backend with the canonical wasm32 DataLayout. Emitted object is
    /// consumed by `wasmtime` (see `crate::wasm_run`).
    Wasm32,
}
84
/// Reference: the wasm32 DataLayout string LLVM emits for
/// `wasm32-wasi` (little-endian, 32-bit pointers, i64 8-byte aligned).
/// The module DataLayout is set authoritatively from the
/// `TargetMachine`'s target data at emit time; this const documents the
/// expected shape — note the `p:32:32` that lowers the i32-offset arena
/// GEPs to 32-bit linear-memory pointers.
///
/// Kept for documentation only, hence the `dead_code` allowance: the
/// emit path reads the layout from the `TargetMachine`, not from here.
#[allow(dead_code)]
const WASM32_DATA_LAYOUT: &str = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20";

/// The wasm32 triple. `wasm32-wasi` so the module can later route
/// effectful host fns through WASI imports (P3 §2.2). For pure-compute
/// workloads `wasm32-unknown-unknown` would also work; wasi is the
/// superset.
const WASM32_TRIPLE: &str = "wasm32-wasi";
98
// `extern "C"` function pointer aliases for the legacy-i64 entry:
// five aliases covering arities 0 through 4 (`MAX_LEGACY_ARITY`), so
// the widest signature carries four i64 slots. Per the v5-β-1
// envelope, shorter signatures pass zero in the trailing slots — the
// emitter only declares the parameters the IR has, so unused trailing
// slots are dead-on-arrival.
// NOTE(review): confirm against the dispatch site whether callers
// pick the arity-matched alias or always call through
// `LegacyEntryFn4` with zero padding.
type LegacyEntryFn4 = unsafe extern "C" fn(i64, i64, i64, i64) -> i64;
type LegacyEntryFn3 = unsafe extern "C" fn(i64, i64, i64) -> i64;
type LegacyEntryFn2 = unsafe extern "C" fn(i64, i64) -> i64;
type LegacyEntryFn1 = unsafe extern "C" fn(i64) -> i64;
type LegacyEntryFn0 = unsafe extern "C" fn() -> i64;
109
/// `extern "C"` function pointer for the buffer-protocol entry. The
/// state pointer comes first to match the cranelift backend's
/// `BufferEntryFn` so the two evaluators share dispatch shape.
///
/// The trailing `i64` is the capability bitmask the source-lowered
/// `Op::CheckCap` gate tests. The `i32` return is the entry's
/// `bytes_written`.
/// NOTE(review): the four `i32` pointer/length params are presumably
/// arena-relative offsets rather than host addresses — confirm against
/// the dispatch code.
type BufferEntryFn = unsafe extern "C" fn(
    *const ArenaState,
    i32, // in_ptr
    i32, // in_len
    i32, // out_ptr
    i32, // out_cap
    i64, // caps
) -> i32;
121
// Phase D.1 fast-path typed entries. Arity-specialised C ABI shapes
// up to 8 args — the arity cap matches `emit_fast_entry`'s envelope.
// Unlike `BufferEntryFn` these take no `ArenaState` pointer: the fast
// entry has no state / trap-code channel, only `(i64...) -> i64`.
type FastEntryFn0 = unsafe extern "C" fn() -> i64;
type FastEntryFn1 = unsafe extern "C" fn(i64) -> i64;
type FastEntryFn2 = unsafe extern "C" fn(i64, i64) -> i64;
type FastEntryFn3 = unsafe extern "C" fn(i64, i64, i64) -> i64;
type FastEntryFn4 = unsafe extern "C" fn(i64, i64, i64, i64) -> i64;
type FastEntryFn5 = unsafe extern "C" fn(i64, i64, i64, i64, i64) -> i64;
type FastEntryFn6 = unsafe extern "C" fn(i64, i64, i64, i64, i64, i64) -> i64;
type FastEntryFn7 = unsafe extern "C" fn(i64, i64, i64, i64, i64, i64, i64) -> i64;
type FastEntryFn8 = unsafe extern "C" fn(i64, i64, i64, i64, i64, i64, i64, i64) -> i64;
133
/// Owned LLVM JIT state for a single compiled module. The
/// [`Context`] / [`ExecutionEngine`] pair must outlive every call
/// into the JITted function pointer; we park them on the heap behind
/// the evaluator so the host can ignore lifetimes.
///
/// Field order is load-bearing: Rust drops struct fields in
/// declaration order, so `_engine` (declared first) is dropped before
/// `_ctx` (declared last). Do not reorder them.
struct JitOwned {
    // The `Context` must outlive the ExecutionEngine; we keep it in a
    // pinned heap slot so the engine's borrow stays valid for the
    // evaluator's lifetime. The `'static` here is an erased lifetime
    // manufactured at the construction site, not a true static borrow.
    _engine: ExecutionEngine<'static>,
    /// Raw entry function pointer resolved once at construction time.
    /// Cached so the hot path is a single indirect call (matches the
    /// cranelift backend's `LegacyEntryFn` stash).
    entry_ptr: usize,
    /// Phase D.1: typed fast entry pointer resolved at construction
    /// time when the source qualifies for the dispatch-boundary fast
    /// path. `None` when the IR fails to lower against the fast
    /// envelope (string ops, sandbox traps, etc.) — `run_main` falls
    /// back to the buffer entry transparently in that case.
    fast_entry_ptr: Option<usize>,
    /// Pre-rendered textual LLVM IR. inkwell 0.9's
    /// `ExecutionEngine::get_module*` is missing, so the dump-time
    /// call cannot reach back to the live module — we pay the
    /// `print_to_string` cost up-front.
    ir_dump: String,
    // Heap-pinned context that `_engine` borrows from. Must stay the
    // last field so it is dropped after the engine.
    _ctx: Box<Context>,
}
160
// SAFETY: the inkwell ExecutionEngine + Context pair is not `Sync`
// by default — LLVM's `LLVMContextRef` is per-thread. We mark the
// pair Send/Sync because `run_main` only reaches back into the JIT
// through the cached function pointers (`entry_ptr`, `fast_entry_ptr`),
// which are immutable after construction; the only per-call mutable
// state is the thread-local `LLVM_ARENA_POOL`, which needs no lock.
//
// NOTE(review): the argument above justifies shared calls (`Sync`).
// `Send` additionally lets the engine and context be moved to, and
// dropped on, a thread other than the one that created them —
// confirm LLVM tolerates cross-thread teardown of this pair.
unsafe impl Send for JitOwned {}
unsafe impl Sync for JitOwned {}
169
/// Buffer schema metadata captured by `from_source`. Mirrors
/// `relon_codegen_cranelift::evaluator::BufferSchema` — kept inside this
/// crate (rather than re-imported) so the LLVM backend stays
/// independent.
struct BufferSchema {
    /// Schema of the declared `#main` parameters; its field names
    /// become the evaluator's `param_names`.
    main_schema: relon_eval_api::schema_canonical::Schema,
    /// Schema of the `#main` return value.
    return_schema: relon_eval_api::schema_canonical::Schema,
    /// Offset table computed from `main_schema` via
    /// `SchemaLayout::offsets_for`.
    main_layout: relon_eval_api::layout::OffsetTable,
    /// Offset table computed from `return_schema`; its `root_size` is
    /// the statically known byte count a buffer entry reports back.
    return_layout: relon_eval_api::layout::OffsetTable,
}
180
/// Phase B LLVM AOT evaluator. Either constructed from a pre-lowered
/// IR module via [`Self::from_ir_direct`] (legacy-i64 envelope) or
/// from a `.relon` source via [`Self::from_source`] (buffer-protocol
/// envelope).
pub struct LlvmAotEvaluator {
    /// Owned JIT state: engine, context, cached entry pointers and the
    /// pre-rendered IR dump.
    jit: JitOwned,
    /// Which entry wrapper this module exposes.
    /// NOTE(review): presumably the shape reported by the emit pass —
    /// confirm at the construction site.
    entry_shape: EntryShape,
    /// NOTE(review): presumably the declared arity validated against
    /// `param_names` at construction — confirm.
    entry_arity: usize,
    /// Declared parameter names in positional order, used by
    /// `run_main` to look up positional args by name.
    param_names: Vec<String>,
    /// Buffer schema for source-driven entries; `None` for direct-IR.
    buffer_schema: Option<BufferSchema>,
    /// Phase D.1: arity of the typed `(i64...) -> i64` fast entry the
    /// JIT module exports alongside the buffer entry. Held here so
    /// `run_main` can pick the fast pointer when the supplied args
    /// match the eligible shape. Equals
    /// `buffer_schema.main_schema.fields.len()` when every field is
    /// `Int`.
    ///
    /// Stored as a bare `usize`, not `Option<usize>`: the single
    /// resolution site in `from_ir_inner_world` assigns it together
    /// with `fast_entry_ptr` from one tuple, so "ptr present, arity
    /// absent" is unrepresentable by construction. Only meaningful
    /// while `jit.fast_entry_ptr` is `Some` (it is `0` and never read
    /// otherwise — both readers gate on the pointer first). Keeping
    /// the per-call dispatch free of an `Option` unwrap matters: a
    /// panicking `expect` here once pushed `run_main_legacy_i64_fast`
    /// past the LTO inline-cost threshold, de-inlining it from bench
    /// loops and costing the W12 kernel 2.7x per call.
    fast_path_arity: usize,
    /// Whether the public `run_main` method may automatically choose the
    /// typed fast entry. The fast entry has no `ArenaState` / trap-code
    /// channel, so bodies that can raise typed runtime traps stay
    /// callable through `run_main_legacy_i64_fast` for benchmarks but
    /// normal host evaluation routes through the buffer entry.
    fast_path_auto_dispatch: bool,
    /// Phase E.1: const-data bytes the IR's `Op::ConstString` /
    /// `Op::ConstList*` records reference through arena-relative i32
    /// offsets. The host copies this blob into the arena prefix at
    /// every dispatch so the JIT-emitted `iconst(I32, offset)` lands
    /// on the right record.
    const_data: Vec<u8>,
    /// Phase 0b: the module's `#native` imports in `import_idx` order.
    /// Carried so [`Self::with_host_fns`] can match a host-supplied
    /// `Arc<dyn RelonFunction>` (keyed by source-level name) to the
    /// `import_idx` the lowering pass assigned.
    native_imports: Vec<relon_ir::ir::NativeImport>,
    /// Phase 0b: host-fn registry installed on every per-call
    /// `ArenaState` so a source-lowered `Op::CallNative` dispatches
    /// through `relon_llvm_call_native`. Behind an `Arc` so the
    /// registry outlives every dispatch without per-call clones; rebuilt
    /// by [`Self::with_host_fns`]. Empty by default — an unregistered
    /// gated call then traps after passing the `CheckCap` gate.
    host_fns: Arc<crate::state::HostFnRegistry>,
    /// Phase 0b: capability bitmask passed as the buffer entry's
    /// trailing `i64 caps` param. The source-lowered `Op::CheckCap`
    /// gate tests bit `cap_bit` of this word; `0` denies every gated
    /// call. Set via [`Self::with_granted_cap`] / [`Self::with_caps`].
    caps_mask: i64,
    /// Remaining step budget installed into each per-call
    /// [`ArenaState`]. `0` means unlimited.
    /// NOTE(review): presumably populated through
    /// `step_budget_to_i64`, which encodes an explicit zero-step limit
    /// as `-1` to keep it distinct from this sentinel — confirm.
    step_budget: AtomicI64,
}
243
thread_local! {
    /// Per-thread arena buffer reused across `run_main_buffer` calls
    /// on the same thread. The pool caches the largest `arena_size`
    /// the thread has ever requested; subsequent dispatches reuse
    /// the allocation and only pay a targeted `fill(0)` over the
    /// observable prefix. Mirrors the cranelift backend's
    /// `ARENA_POOL` to keep the dispatch boundary cost comparable.
    ///
    /// Starts as an empty `Vec` built in a `const` initializer, so
    /// nothing is allocated until a thread first grows the pool.
    static LLVM_ARENA_POOL: RefCell<Vec<u8>> = const { RefCell::new(Vec::new()) };
}
253
/// Encode an optional step limit as the signed budget word.
///
/// - `None` becomes `0`, the "unlimited" sentinel documented on the
///   evaluator's `step_budget` field.
/// - `Some(0)` becomes `-1`, so an explicit zero-step limit is never
///   confused with the unlimited sentinel.
/// - `Some(n)` becomes `n`, saturating at `i64::MAX` when `n` does not
///   fit in an `i64`.
fn step_budget_to_i64(steps: Option<u64>) -> i64 {
    steps.map_or(0, |limit| {
        if limit == 0 {
            -1
        } else {
            i64::try_from(limit).unwrap_or(i64::MAX)
        }
    })
}
261
262impl LlvmAotEvaluator {
/// Compile a pre-lowered IR module into a JIT-resident function
/// pointer for the legacy-i64 entry shape (`(I64...) -> I64`).
///
/// No schema metadata is attached on this path, so an entry with the
/// buffer-protocol shape (`(I32, I32, I32, I32, I64) -> I32`) is
/// rejected during construction — use [`Self::from_source`] for
/// those.
///
/// `param_names` parallels the cranelift backend's
/// `from_ir_direct` arg so the `Evaluator::run_main` dispatch
/// can look up positional args by their declared name. Direct-IR
/// callers without a schema can pass synthetic
/// `["arg0", "arg1", …]` names.
///
/// # Errors
///
/// Returns `LlvmError::UnsupportedSignature` when the entry declares
/// more than [`MAX_LEGACY_ARITY`] parameters, when
/// `param_names.len()` does not match the entry's arity, or when the
/// entry is buffer-protocol shaped. Returns `LlvmError::Codegen` when
/// the module has no entry function or the LLVM verifier rejects the
/// emitted module; other emit failures propagate unchanged.
pub fn from_ir_direct(
    ir: relon_ir::ir::Module,
    param_names: Vec<String>,
) -> Result<Self, LlvmError> {
    // `None`: direct-IR callers carry no buffer schema.
    Self::from_ir_inner(ir, param_names, None)
}
280
/// Drive the full `parse → analyze → lower → emit → JIT` pipeline
/// against a `.relon` source. Matches the cranelift backend's
/// `AotEvaluator::from_source` shape so hosts can swap the two
/// evaluators by changing the constructor.
///
/// Phase B accepts the IR shape `lower_workspace_single` emits
/// for `#main` source with the W1 / W2 production envelope
/// (range / map / sum). Sources outside that envelope (closures
/// past peephole, schema-method dispatch, stdlib calls, …) fail
/// at the LLVM emit step with `LlvmError::Codegen`.
///
/// # Errors
///
/// Returns `LlvmError::Parse` / `LlvmError::Analyze` for frontend
/// failures, and `LlvmError::Codegen` for lowering, schema-layout, or
/// LLVM emit failures.
pub fn from_source(src: &str) -> Result<Self, LlvmError> {
    // `None`: run the analyzer with default (non-strict) options.
    Self::from_source_with_options_inner(src, None)
}
294
/// Like [`Self::from_source`] but with caller-supplied analyzer
/// options — the entry point for host-registered `#native` fns.
/// The host populates `options.host_fn_names` /
/// `host_fn_signatures` / `host_fn_gates` / `caps` so the analyzer
/// resolves the calls, runs the single-file capability-reachability
/// check (a gated call without the statically-granted cap fails the
/// build here), and the lowering pass emits the `Op::CheckCap`-
/// guarded `Op::CallNative`.
///
/// `options.strict_mode` is not honoured: the analyzer always runs
/// with `strict_mode: false` on this backend (a private copy of
/// `options` is made when the caller left it `true`).
///
/// The returned evaluator carries an empty host-fn registry and a
/// zero capability mask; chain [`Self::with_host_fns`] +
/// [`Self::with_granted_cap`] to wire the runtime dispatch + grant.
/// Mirrors the cranelift backend's `from_source_with_options`.
///
/// # Errors
///
/// Same surface as [`Self::from_source`].
pub fn from_source_with_options(
    src: &str,
    options: &relon_analyzer::AnalyzeOptions,
) -> Result<Self, LlvmError> {
    Self::from_source_with_options_inner(src, Some(options))
}
314
315 fn from_source_with_options_inner(
316 src: &str,
317 options: Option<&relon_analyzer::AnalyzeOptions>,
318 ) -> Result<Self, LlvmError> {
319 let (ir, main_schema, return_schema) = Self::lower_source_with_options(src, options)?;
320 let main_layout = relon_eval_api::layout::SchemaLayout::offsets_for(&main_schema)
321 .map_err(|e| LlvmError::Codegen(format!("main schema layout: {e}")))?;
322 let return_layout = relon_eval_api::layout::SchemaLayout::offsets_for(&return_schema)
323 .map_err(|e| LlvmError::Codegen(format!("return schema layout: {e}")))?;
324 let param_names: Vec<String> = main_schema.fields.iter().map(|f| f.name.clone()).collect();
325 let schema = BufferSchema {
326 main_schema,
327 return_schema,
328 main_layout,
329 return_layout,
330 };
331 Self::from_ir_inner(ir, param_names, Some(schema))
332 }
333
334 fn lower_source_with_options(
335 src: &str,
336 options: Option<&relon_analyzer::AnalyzeOptions>,
337 ) -> Result<
338 (
339 relon_ir::ir::Module,
340 relon_eval_api::schema_canonical::Schema,
341 relon_eval_api::schema_canonical::Schema,
342 ),
343 LlvmError,
344 > {
345 // W7 closure-as-value (Phase F.W7): the production source
346 // `#main(Int n) -> Dict { #internal fib: (k) => ..., result: fib(n) }`
347 // trips the v1.5 / v1.6 strict-mode type-surface diagnostics
348 // (`ClosureParamTypeMissing`, `ClosureReturnTypeUnknown`,
349 // `ExpressionTypeUnknown`) even though IR lowering accepts the
350 // shape via `lower_anon_dict_body`. Run the analyzer with
351 // `strict_mode: false` so the soft bans don't gate LLVM
352 // codegen. Hard structural errors (`UnknownTypeName`,
353 // `MainReturnTypeMismatch`, etc.) still surface as `Error`-
354 // severity diagnostics under non-strict mode and still gate the
355 // build below. Unlike the Cranelift route, the LLVM backend does
356 // NOT force `standalone_capability_check`.
357 //
358 // Phase 0b: a caller-supplied `options` (host `#native` fns)
359 // takes precedence — the host already sets `strict_mode: false`
360 // on it (see the cranelift `host_options` fixture). We force
361 // `strict_mode: false` regardless so the closure surface stays
362 // unblocked even if a host left it default-true.
363 let owned;
364 let options: &relon_analyzer::AnalyzeOptions = match options {
365 Some(o) => {
366 if o.strict_mode {
367 owned = relon_analyzer::AnalyzeOptions {
368 strict_mode: false,
369 ..o.clone()
370 };
371 &owned
372 } else {
373 o
374 }
375 }
376 None => {
377 owned = relon_analyzer::AnalyzeOptions {
378 strict_mode: false,
379 ..Default::default()
380 };
381 &owned
382 }
383 };
384 // Map the shared frontend pipeline error onto this backend's
385 // surface: Parse → Parse, Analyze(n) → Analyze(n), and Lowering
386 // → Codegen with the historical `lower_workspace_single:` prefix
387 // (the LLVM backend has no dedicated `Lowering` variant).
388 let lowered = relon_ir::frontend::compile(src, options).map_err(|e| match e {
389 relon_ir::FrontendError::Parse(msg) => LlvmError::Parse(msg),
390 relon_ir::FrontendError::Analyze(n) => LlvmError::Analyze(n),
391 relon_ir::FrontendError::Lowering(msg) => {
392 LlvmError::Codegen(format!("lower_workspace_single: {msg}"))
393 }
394 })?;
395 Ok((lowered.module, lowered.main_schema, lowered.return_schema))
396 }
397
398 /// Stage 2.⑤ closed-world source constructor. Builds the
399 /// buffer-protocol JIT evaluator with `Op::CallNative` lowered to a
400 /// direct `call @<host_symbol>`, links + inlines the host shim
401 /// bitcode, and reuses the open-world arena-handshake dispatch
402 /// (`run_main`) verbatim — the entry symbol / signature are
403 /// identical, only the native-dispatch lowering differs. No host-fn
404 /// registry / cap mask is needed at runtime: the host body is folded
405 /// into the entry by the LTO inline, so there is no dynamic
406 /// `relon_llvm_call_native` hop to resolve.
407 ///
408 /// The differential oracle for this path is the open-world
409 /// `from_source_with_options` + `run_main` result (anchored, in
410 /// turn, to cranelift's `native_call_from_source`).
411 pub fn from_source_closed_world(
412 src: &str,
413 options: &relon_analyzer::AnalyzeOptions,
414 host_shim_src: &str,
415 ) -> Result<Self, LlvmError> {
416 let (ir, main_schema, return_schema) = Self::lower_source_with_options(src, Some(options))?;
417 let main_layout = relon_eval_api::layout::SchemaLayout::offsets_for(&main_schema)
418 .map_err(|e| LlvmError::Codegen(format!("main schema layout: {e}")))?;
419 let return_layout = relon_eval_api::layout::SchemaLayout::offsets_for(&return_schema)
420 .map_err(|e| LlvmError::Codegen(format!("return schema layout: {e}")))?;
421 let param_names: Vec<String> = main_schema.fields.iter().map(|f| f.name.clone()).collect();
422 let schema = BufferSchema {
423 main_schema,
424 return_schema,
425 main_layout,
426 return_layout,
427 };
428 Self::from_ir_inner_world(
429 ir,
430 param_names,
431 Some(schema),
432 WorldMode::ClosedWorld,
433 Some(host_shim_src),
434 )
435 }
436
/// Open-world shorthand for [`Self::from_ir_inner_world`]: forwards
/// `ir`, `param_names` and `buffer_schema` unchanged, selecting
/// `WorldMode::OpenWorld` with no host shim source.
fn from_ir_inner(
    ir: relon_ir::ir::Module,
    param_names: Vec<String>,
    buffer_schema: Option<BufferSchema>,
) -> Result<Self, LlvmError> {
    Self::from_ir_inner_world(ir, param_names, buffer_schema, WorldMode::OpenWorld, None)
}
444
445 fn from_ir_inner_world(
446 ir: relon_ir::ir::Module,
447 param_names: Vec<String>,
448 buffer_schema: Option<BufferSchema>,
449 world_mode: WorldMode,
450 host_shim_src: Option<&str>,
451 ) -> Result<Self, LlvmError> {
452 let entry_idx = ir
453 .entry_func_index
454 .ok_or_else(|| LlvmError::Codegen("IR module has no entry function".into()))?;
455 let entry = &ir.funcs[entry_idx];
456
457 // Detect the shape up-front so we can validate `param_names`
458 // against the correct envelope.
459 let buffer_shape = is_buffer_protocol_signature(&entry.params, entry.ret);
460 if !buffer_shape && entry.params.len() > MAX_LEGACY_ARITY {
461 return Err(LlvmError::UnsupportedSignature(format!(
462 "llvm-aot: {} params exceeds MAX_LEGACY_ARITY={MAX_LEGACY_ARITY}",
463 entry.params.len()
464 )));
465 }
466 let declared_arity = if buffer_shape {
467 buffer_schema
468 .as_ref()
469 .map(|s| s.main_schema.fields.len())
470 .unwrap_or(0)
471 } else {
472 entry.params.len()
473 };
474 if param_names.len() != declared_arity {
475 return Err(LlvmError::UnsupportedSignature(format!(
476 "llvm-aot: param_names len {} does not match declared arity {declared_arity}",
477 param_names.len()
478 )));
479 }
480 if buffer_shape && buffer_schema.is_none() {
481 // A direct-IR caller handed in a buffer-protocol IR
482 // without schema metadata. We can still JIT-compile,
483 // but `run_main` won't be able to pack the input or
484 // decode the output. Reject up-front so the host knows.
485 return Err(LlvmError::UnsupportedSignature(
486 "llvm-aot: buffer-protocol IR requires schema metadata; use from_source".into(),
487 ));
488 }
489 if !buffer_shape && buffer_schema.is_some() {
490 return Err(LlvmError::UnsupportedSignature(
491 "llvm-aot: schema metadata supplied for non-buffer entry".into(),
492 ));
493 }
494
495 // Build the LLVM module under a per-evaluator Context. We
496 // leak the Context onto the heap and transmute the engine's
497 // lifetime to `'static` (see SAFETY note on `JitOwned`).
498 let ctx_box: Box<Context> = Box::new(Context::create());
499 // SAFETY: `ctx_box` lives on the heap and we never deallocate
500 // it before the engine.
501 let ctx_static: &'static Context = unsafe { &*(ctx_box.as_ref() as *const Context) };
502
503 let module = ctx_static.create_module("relon_llvm_aot");
504
505 // Buffer-protocol entries return `bytes_written` as i32; under
506 // the Phase B envelope this is statically the schema's
507 // `return_layout.root_size` (no pointer-indirect StoreField
508 // bumps the tail cursor). Legacy entries ignore this value.
509 let buffer_return_size = buffer_schema
510 .as_ref()
511 .map(|s| s.return_layout.root_size as u32)
512 .unwrap_or(0);
513 // Phase E.1: build the const-data pool by walking every
514 // function body in `ir`. The blob is shipped to the host
515 // alongside the JIT engine and copied to the arena prefix at
516 // every dispatch so `Op::ConstString { idx }` resolves to a
517 // stable arena-relative offset.
518 let const_pool = ConstPool::from_module(&ir)?;
519 // Phase E.2: collect every IR sibling function (non-entry,
520 // non-lambda) so the LLVM emit pass can lower them alongside
521 // the entry. The entry's `Op::Call` lowering resolves
522 // user-defined sibling calls through the returned helper
523 // table.
524 //
525 // Phase F.W7: collect the lambdas (funcs registered in
526 // `closure_table`) separately so the emit pass can apply the
527 // widened `(state, captures_ptr, ...params) -> ret` signature
528 // and seed the closure function-pointer table. The IR's
529 // `closure_table` maps a `fn_table_idx` to an `ir.funcs`
530 // index; we mirror that order so the emit pass's
531 // `closure_fn_table[fn_table_idx]` matches what `MakeClosure`
532 // references.
533 let lambda_ir_idx_set: std::collections::HashSet<u32> =
534 ir.closure_table.iter().copied().collect();
535 let helpers: Vec<&relon_ir::ir::Func> = ir
536 .funcs
537 .iter()
538 .enumerate()
539 .filter(|(i, _)| *i != entry_idx && !lambda_ir_idx_set.contains(&(*i as u32)))
540 .map(|(_, f)| f)
541 .collect();
542 let helper_ir_indices: Vec<u32> = ir
543 .funcs
544 .iter()
545 .enumerate()
546 .filter(|(i, _)| *i != entry_idx && !lambda_ir_idx_set.contains(&(*i as u32)))
547 .map(|(i, _)| i as u32)
548 .collect();
549 let lambdas: Vec<&relon_ir::ir::Func> = ir
550 .closure_table
551 .iter()
552 .map(|&ir_idx| &ir.funcs[ir_idx as usize])
553 .collect();
554 let emit = match world_mode {
555 WorldMode::OpenWorld => emit_module_funcs,
556 WorldMode::ClosedWorld => emit_module_funcs_closed_world,
557 };
558 let (_llvm_fn, entry_shape, helper_table, closure_fn_table) = emit(
559 ctx_static,
560 &module,
561 entry,
562 buffer_return_size,
563 &const_pool,
564 &helpers,
565 Some(&helper_ir_indices),
566 &lambdas,
567 &ir.closure_table,
568 &ir.imports,
569 )?;
570
571 // Stage 2.⑤ closed-world: link + inline the host shim bitcode
572 // into the JIT module so the direct `call @<host_symbol>` sites
573 // fold into the host body during the O3 pass below. Done before
574 // the fast-entry emit so a fast entry (Int-only, no native call)
575 // is unaffected; closed-world sources always take the buffer
576 // entry because they carry an `Op::CallNative`.
577 if matches!(world_mode, WorldMode::ClosedWorld) {
578 let shim = host_shim_src.ok_or_else(|| {
579 LlvmError::Codegen(
580 "from_ir_inner_world: ClosedWorld requires a host_shim_src".into(),
581 )
582 })?;
583 crate::cocompile::link_and_inline_host_shim(&module, shim, &ir.imports)?;
584 }
585
586 // Phase D.1 / D.2: attempt to emit the typed fast-path entry
587 // alongside the buffer entry whenever the schema qualifies.
588 // Emission failure is treated as a "no fast path available"
589 // condition rather than a hard error — the IR can stay on
590 // the buffer entry, which is correct (just slower).
591 //
592 // We discover eligibility from the `buffer_schema` (declared
593 // `#main` params + return) and the IR body. Sources that
594 // touch ops outside the fast envelope (strings, sandbox
595 // traps, etc.) fail emission inside `emit_fast_entry`; we
596 // capture the error to the IR dump for post-mortem and
597 // continue with the buffer-only module.
598 //
599 // Closure modules are stateful even when their outer schema
600 // looks like a single-Int fast shape: lambda bodies receive the
601 // real `ArenaState` so they can read captures from the arena and
602 // participate in bounds/trap semantics. The typed fast entry has
603 // no state pointer, so keep it off for any closure table entry.
604 // The wasm/object path already applies this same routing rule.
605 let fast_profile = buffer_schema
606 .as_ref()
607 .filter(|_| ir.closure_table.is_empty())
608 .and_then(|s| build_fast_path_profile(s).ok());
609 let fast_path_auto_dispatch = !body_may_raise_typed_trap(&entry.body);
610 let mut fast_emit_diagnostic: Option<String> = None;
611 if let Some(profile) = fast_profile.as_ref() {
612 match emit_fast_entry(
613 ctx_static,
614 &module,
615 entry,
616 profile,
617 &helper_table,
618 &closure_fn_table,
619 ) {
620 Ok(_) => {}
621 Err(e) => {
622 fast_emit_diagnostic = Some(format!("{e}"));
623 // Roll back the partially-emitted fast entry so
624 // the module verifies cleanly with just the
625 // buffer entry. inkwell's `delete` is unsafe
626 // because it invalidates any outstanding
627 // `FunctionValue` handle; the emitter dropped
628 // its handle when `emit_fast_entry` returned.
629 if let Some(f) = module.get_function(ENTRY_SYMBOL_FAST) {
630 unsafe { f.delete() };
631 }
632 }
633 }
634 }
635
636 module
637 .verify()
638 .map_err(|e| LlvmError::Codegen(format!("LLVM verifier rejected module: {e}")))?;
639
640 // Pin every function to the RUNTIME host CPU before MCJIT
641 // codegen. The MCJIT engine builders take no MCPU, so without
642 // this the X86 backend lowers for generic x86-64 and drops the
643 // host `SlowDivide64` narrowing — every i64 `%` / `/` becomes a
644 // bare microcoded `idivq` instead of the host `shrq $32; je;
645 // divl` fast path. The O3 pipeline and the static object-emit
646 // path already target the host; this brings the JIT backend in
647 // line. Stamping `target-cpu` / `target-features` (host-queried,
648 // never hard-coded) is the lever inkwell 0.9 / MCJIT exposes.
649 // Results are byte-identical to the generic lowering — this is a
650 // codegen-quality / instruction-selection fix, not a semantics
651 // change.
652 stamp_host_target_attributes(&module);
653
654 // Run LLVM's `-O3` middle-end pipeline on the module before
655 // handing it to MCJIT. MCJIT's `OptimizationLevel::Aggressive`
656 // controls backend codegen optimizations (regalloc, instr
657 // selection) but does **not** invoke the IR-level passes —
658 // `mem2reg`, `instcombine`, `gvn`, `licm`, loop-unroll,
659 // SLP-vectorize, etc. live in the middle-end pipeline. Without
660 // them the emitted IR's alloca-heavy stack-machine lowering
661 // hits the assembler unsimplified, leaving a 100×+ gap vs the
662 // equivalent native Rust hot loop.
663 //
664 // The pipeline is built fresh through `PassBuilderOptions`
665 // (the LLVM 17+ new pass manager) since inkwell 0.9 deprecates
666 // the legacy `PassManager` for IR-level work on LLVM 18.
667 // Debug: capture pre-opt IR if the env requests it via
668 // `RELON_LLVM_DUMP_PREOPT=1`. The pre-opt shape is mostly
669 // alloca / load / store noise but is useful when verifying
670 // that emitter changes survived the dispatch path (post-opt
671 // IR can have aggressive constant folding that makes brand-
672 // new branches invisible). The flag is intentionally opt-in
673 // so production paths never pay the second IR dump.
674 let preopt_dump: Option<String> = std::env::var_os("RELON_LLVM_DUMP_PREOPT")
675 .map(|_| module.print_to_string().to_string());
676
677 run_default_o3_pipeline(&module)?;
678
679 // Capture the dumped IR *after* the optimizer ran so tests
680 // that assert on the IR see the post-opt shape (mem2reg /
681 // loop simplification visible). The pre-opt shape is mostly
682 // alloca / load / store noise.
683 let mut ir_dump = module.print_to_string().to_string();
684 if let Some(p) = preopt_dump {
685 ir_dump = format!("; --- PRE-OPT IR ---\n{p}\n; --- POST-OPT IR ---\n{ir_dump}");
686 }
687
688 // Phase L profile-first: dump post-O3 IR + host-targeted ASM
689 // to `$RELON_LLVM_DUMP_DIR/` when the env var is set. The dump
690 // mirrors the actual MCJIT codegen path (same TargetMachine
691 // knobs as `run_default_o3_pipeline`) so the .s file matches
692 // what the JIT engine actually emits at JIT-resolve time.
693 if let Some(dir) = std::env::var_os("RELON_LLVM_DUMP_DIR") {
694 let dir = std::path::PathBuf::from(dir);
695 let _ = std::fs::create_dir_all(&dir);
696 let _ = std::fs::write(dir.join("module.post_o3.ll"), &ir_dump);
697 // Re-create a TargetMachine matching the JIT path so the
698 // dumped ASM is byte-equivalent to what MCJIT codegen
699 // hands to the loader. The codegen-side OptLevel for MCJIT
700 // is `Aggressive` (see `create_jit_execution_engine` call
701 // below); mirror that here.
702 if let Ok(()) = Target::initialize_native(&InitializationConfig::default()) {
703 let triple_str = TargetMachine::get_default_triple();
704 if let Ok(target) = Target::from_triple(&triple_str) {
705 let cpu = TargetMachine::get_host_cpu_name();
706 let features = TargetMachine::get_host_cpu_features();
707 if let Ok(triple_utf8) = triple_str.as_str().to_str() {
708 let triple = TargetTriple::create(triple_utf8);
709 if let Some(machine) = target.create_target_machine(
710 &triple,
711 cpu.to_str().unwrap_or(""),
712 features.to_str().unwrap_or(""),
713 OptimizationLevel::Aggressive,
714 RelocMode::Default,
715 CodeModel::JITDefault,
716 ) {
717 let _ = machine.write_to_file(
718 &module,
719 FileType::Assembly,
720 &dir.join("module.s"),
721 );
722 let _ = machine.write_to_file(
723 &module,
724 FileType::Object,
725 &dir.join("module.o"),
726 );
727 }
728 // Dump variant: CodeModel::Small + RelocMode::PIC
729 // so we can A/B with `module.s` and see whether the
730 // recursive call shrinks to a PC-rel `callq <sym>`.
731 if let Some(machine) = target.create_target_machine(
732 &triple,
733 cpu.to_str().unwrap_or(""),
734 features.to_str().unwrap_or(""),
735 OptimizationLevel::Aggressive,
736 RelocMode::PIC,
737 CodeModel::Small,
738 ) {
739 let _ = machine.write_to_file(
740 &module,
741 FileType::Assembly,
742 &dir.join("module.small_pic.s"),
743 );
744 }
745 // Dump variant: CodeModel::Small + RelocMode::Static.
746 if let Some(machine) = target.create_target_machine(
747 &triple,
748 cpu.to_str().unwrap_or(""),
749 features.to_str().unwrap_or(""),
750 OptimizationLevel::Aggressive,
751 RelocMode::Static,
752 CodeModel::Small,
753 ) {
754 let _ = machine.write_to_file(
755 &module,
756 FileType::Assembly,
757 &dir.join("module.small_static.s"),
758 );
759 }
760 }
761 }
762 }
763 }
764
765 // Phase L codegen-quality: pick the MCJIT engine builder by
766 // whether the module references the host-side `contains` shim.
767 //
768 // - **No extern** -> use the custom memory manager + Small
769 // CodeModel. All same-module calls collapse to direct
770 // `callq <pcrel32>` instead of MCJIT's default
771 // `movabsq + callq *%reg` (Large CodeModel). For tight
772 // recursive bodies like W7 fib this saves ~0.2 ns / call
773 // on Intel; multiplied by fib(22)'s ~35 k call tree it
774 // closes ~10 µs of the gap vs the rustc LTO build.
775 //
776 // - **Extern present** -> stay on the default JIT builder
777 // (Large CodeModel) because the host-side shim lives in
778 // the executable's `.text` which is typically > 2 GB away
779 // from the JIT's freshly-mmap'd code arena. A 32-bit
780 // PC-relative relocation would fail to resolve; the Large
781 // CodeModel's `movabsq + indirect` pattern handles it.
782 //
783 // Detection is purely structural — we look up the shim
784 // symbol on the module. The emitter declares it lazily, so
785 // its presence means "this module has at least one extern
786 // call site that needs `add_global_mapping` after engine
787 // creation".
788 // Phase 0b: the native-dispatch helper is also a host-resident
789 // extern (it lives in this crate's `.text`, not the JIT arena),
790 // so a module that references it must stay on the default JIT
791 // builder (Large CodeModel) for the same ±2 GB-relocation reason
792 // the `str.contains` shim does.
793 let uses_extern_shim = module
794 .get_function(crate::str_helpers::RELON_LLVM_STR_CONTAINS_ARENA_SYMBOL)
795 .is_some()
796 || module
797 .get_function(crate::str_helpers::RELON_LLVM_F64_TO_STR_SYMBOL)
798 .is_some()
799 || module
800 .get_function(crate::state::RELON_LLVM_CALL_NATIVE_SYMBOL)
801 .is_some();
802 let force_default_mcjit = std::env::var_os("RELON_LLVM_FORCE_DEFAULT_MCJIT").is_some();
803 let engine = if uses_extern_shim || force_default_mcjit {
804 module
805 .create_jit_execution_engine(OptimizationLevel::Aggressive)
806 .map_err(|e| LlvmError::Codegen(format!("create_jit_execution_engine: {e}")))?
807 } else {
808 let mm = crate::mcjit_mm::ContiguousCodeMemoryManager::new();
809 module
810 .create_mcjit_execution_engine_with_memory_manager(
811 mm,
812 OptimizationLevel::Aggressive,
813 inkwell::targets::CodeModel::Small,
814 /*no_frame_pointer_elim=*/ false,
815 /*enable_fast_isel=*/ false,
816 )
817 .map_err(|e| {
818 LlvmError::Codegen(format!(
819 "create_mcjit_execution_engine_with_memory_manager (Small CodeModel): {e}"
820 ))
821 })?
822 };
823
824 // Phase F.1: wire the host shim that backs the LLVM AOT
825 // `contains(haystack, needle) -> Bool` fast path. The emitter
826 // declares this symbol with `Linkage::External` whenever a
827 // module references it; MCJIT needs an explicit address
828 // mapping because the default resolver (`dlsym`) cannot see
829 // statics from inside the current dylib's strip-able section
        // layout. The mapping is installed only when the module
        // actually declares the symbol; a module that never
        // referenced it has nothing to map.
832 if let Some(shim_fn) =
833 module.get_function(crate::str_helpers::RELON_LLVM_STR_CONTAINS_ARENA_SYMBOL)
834 {
835 engine.add_global_mapping(
836 &shim_fn,
837 crate::str_helpers::relon_llvm_str_contains_arena_addr(),
838 );
839 }
840
841 // Wave B: same constraint for the float-render shim — the
842 // `Op::FloatToStr` lowering declares `relon_llvm_f64_to_str`
843 // as an external function whose body lives in this dylib's
844 // `.text`. No-op when the module never rendered a Float.
845 if let Some(shim_fn) = module.get_function(crate::str_helpers::RELON_LLVM_F64_TO_STR_SYMBOL)
846 {
847 engine.add_global_mapping(&shim_fn, crate::str_helpers::relon_llvm_f64_to_str_addr());
848 }
849
850 // Phase 0b: map the native-dispatch helper symbol to its host
851 // address so an emitted `call @relon_llvm_call_native` resolves.
852 // The default MCJIT resolver (`dlsym`) cannot see the static
853 // from inside this dylib's section layout — same constraint as
854 // the `str.contains` shim. No-op when the module never emitted
855 // a `CallNative` (the symbol is absent).
856 if let Some(cn_fn) = module.get_function(crate::state::RELON_LLVM_CALL_NATIVE_SYMBOL) {
857 engine.add_global_mapping(&cn_fn, crate::state::relon_llvm_call_native_addr());
858 }
859
860 let entry_ptr = engine.get_function_address(ENTRY_SYMBOL).map_err(|e| {
861 LlvmError::Codegen(format!(
862 "ExecutionEngine could not resolve `{ENTRY_SYMBOL}`: {e}"
863 ))
864 })?;
865
866 // Phase D.1: resolve the typed fast-entry pointer when the
867 // module exported one. Resolution failure here is *not* an
868 // emit-side bug — the symbol simply wasn't emitted (or was
869 // rolled back) — so we treat it as "no fast path" silently.
870 //
871 // Pairing invariant: this is the *only* assignment of the
872 // `fast_entry_ptr`/`fast_path_arity` pair. Both arms set the
873 // two together, so a live pointer always carries the profile's
874 // real arity and the `(Some ptr, missing arity)` state cannot
875 // exist — the hot dispatch reads the arity without any
876 // `Option` check or panic path.
877 let (fast_entry_ptr, fast_path_arity) = match (&fast_profile, &fast_emit_diagnostic) {
878 (Some(profile), None) => match engine.get_function_address(ENTRY_SYMBOL_FAST) {
879 Ok(ptr) => (Some(ptr), profile.arg_offsets.len()),
880 Err(_) => (None, 0),
881 },
882 _ => (None, 0),
883 };
884 // Stash the fast-emit diagnostic (if any) into the IR dump so
885 // post-mortem tests can assert on it without needing a
886 // dedicated getter. The dump is only consumed by tests so the
887 // overhead doesn't matter at runtime.
888 let ir_dump = match fast_emit_diagnostic {
889 Some(diag) => format!("; fast-emit diagnostic: {diag}\n{ir_dump}"),
890 None => ir_dump,
891 };
892
893 Ok(Self {
894 jit: JitOwned {
895 _engine: engine,
896 entry_ptr,
897 fast_entry_ptr,
898 ir_dump,
899 _ctx: ctx_box,
900 },
901 entry_shape,
902 entry_arity: entry.params.len(),
903 param_names,
904 buffer_schema,
905 fast_path_arity,
906 fast_path_auto_dispatch,
907 const_data: const_pool.bytes,
908 native_imports: ir.imports.clone(),
909 host_fns: Arc::new(crate::state::HostFnRegistry::new()),
910 caps_mask: 0,
911 step_budget: AtomicI64::new(0),
912 })
913 }
914
915 /// Number of `#main` arguments expected. Under the buffer-protocol
916 /// shape this is the count of declared `#main(...)` params (from
917 /// the source schema), not the entry function's IR arity (which
918 /// is always 5 for buffer protocol). Under the legacy-i64 shape
919 /// the two coincide.
920 pub fn arity(&self) -> usize {
921 self.param_names.len()
922 }
923
924 /// Names of the declared `#main` parameters in declaration order.
925 pub fn param_names(&self) -> &[String] {
926 &self.param_names
927 }
928
929 /// Phase 0b: the `#native` imports the lowering pass interned for
930 /// this module, in `import_idx` order. Lets a host map fn names to
931 /// the slots [`Self::with_host_fns`] fills. Mirrors the cranelift
932 /// backend's `native_imports`.
933 pub fn native_imports(&self) -> &[relon_ir::ir::NativeImport] {
934 &self.native_imports
935 }
936
937 /// Phase 0b: register the host's `Arc<dyn RelonFunction>` callables
938 /// for source-lowered native-fn dispatch. Each entry is keyed by the
939 /// source-level fn name; this matches the name to the `import_idx`
940 /// the lowering pass assigned (via [`Self::native_imports`]) and
941 /// installs the callable in the evaluator's `import_idx`-keyed
942 /// registry. A source-lowered `Op::CallNative` then dispatches to it
943 /// through the `relon_llvm_call_native` helper. Names with no
944 /// matching `#native` import are skipped. Mirrors the cranelift
945 /// backend's `with_host_fns`.
946 ///
947 /// The capability *guard* is enforced independently by the
948 /// `Op::CheckCap` prologue against the granted `caps` mask
949 /// ([`Self::with_granted_cap`]) — registering a callable does not
950 /// grant its capability.
951 pub fn with_host_fns(
952 mut self,
953 host_fns: &std::collections::HashMap<String, Arc<dyn relon_eval_api::RelonFunction>>,
954 ) -> Self {
955 let mut registry = crate::state::HostFnRegistry::new();
956 for (idx, imp) in self.native_imports.iter().enumerate() {
957 if let Some(func) = host_fns.get(&imp.name) {
958 registry.register(idx as u32, Arc::clone(func));
959 }
960 }
961 self.host_fns = Arc::new(registry);
962 self
963 }
964
965 /// Phase 0b: grant a capability bit so the source-lowered
966 /// `Op::CheckCap` prologue passes at runtime. Sets bit `bit` in the
967 /// `caps` bitmask the buffer entry receives as its trailing `i64`
968 /// param. Decoupled from the analyze-time `caps`: a host can grant
969 /// statically (build passes the reachability check) yet withhold
970 /// here to exercise a stricter runtime posture (the gated call then
971 /// traps `CapabilityDenied`). Mirrors the cranelift backend's
972 /// `with_granted_cap` outcome class.
973 pub fn with_granted_cap(mut self, bit: u32) -> Self {
974 if bit < 64 {
975 self.caps_mask |= 1i64 << bit;
976 }
977 self
978 }
979
980 /// Phase 0b: set the full `caps` bitmask wholesale (the trailing
981 /// `i64` param the buffer entry's `Op::CheckCap` gate tests).
982 /// Companion to [`Self::with_granted_cap`] for hosts that already
983 /// hold a packed mask.
984 pub fn with_caps(mut self, caps_mask: i64) -> Self {
985 self.caps_mask = caps_mask;
986 self
987 }
988
989 /// Configure the LLVM buffer-entry step budget. `None` disables
990 /// the budget. `Some(n)` permits `n` entry/loop budget ticks before
991 /// the JIT records `ResourceExhausted` and the host lifts it to
992 /// `RuntimeError::StepLimitExceeded`.
993 pub fn set_step_budget(&self, steps: Option<u64>) {
994 self.step_budget
995 .store(step_budget_to_i64(steps), Ordering::Relaxed);
996 }
997
998 /// Builder-style companion to [`Self::set_step_budget`].
999 pub fn with_step_budget(self, steps: Option<u64>) -> Self {
1000 self.set_step_budget(steps);
1001 self
1002 }
1003
1004 /// Fast-path entry mirroring `AotEvaluator::run_main_legacy_i64`:
1005 /// skip the HashMap pack and invoke the JIT entry with a slice of
1006 /// positional i64 args. Only valid for the legacy-i64 entry shape.
1007 pub fn run_main_legacy_i64(&self, args: &[i64]) -> Result<i64, RuntimeError> {
1008 if self.entry_shape != EntryShape::LegacyI64 {
1009 return Err(RuntimeError::Unsupported {
1010 reason: "llvm-aot: run_main_legacy_i64 called on buffer-protocol entry".into(),
1011 });
1012 }
1013 if args.len() != self.entry_arity {
1014 return Err(RuntimeError::Unsupported {
1015 reason: format!(
1016 "llvm-aot: #main expects {} arg(s), got {}",
1017 self.entry_arity,
1018 args.len()
1019 ),
1020 });
1021 }
1022 let ptr = self.jit.entry_ptr;
1023 // SAFETY: see Phase A `run_main_legacy_i64` for the same
1024 // transmute-and-call pattern. The cached `entry_ptr` was
1025 // returned by `ExecutionEngine::get_function_address` at
1026 // construction time and stays valid for the engine's
1027 // lifetime.
1028 unsafe {
1029 match self.entry_arity {
1030 0 => {
1031 let f: LegacyEntryFn0 = std::mem::transmute(ptr);
1032 Ok(f())
1033 }
1034 1 => {
1035 let f: LegacyEntryFn1 = std::mem::transmute(ptr);
1036 Ok(f(args[0]))
1037 }
1038 2 => {
1039 let f: LegacyEntryFn2 = std::mem::transmute(ptr);
1040 Ok(f(args[0], args[1]))
1041 }
1042 3 => {
1043 let f: LegacyEntryFn3 = std::mem::transmute(ptr);
1044 Ok(f(args[0], args[1], args[2]))
1045 }
1046 4 => {
1047 let f: LegacyEntryFn4 = std::mem::transmute(ptr);
1048 Ok(f(args[0], args[1], args[2], args[3]))
1049 }
1050 n => Err(RuntimeError::Unsupported {
1051 reason: format!("llvm-aot: arity {n} > MAX_LEGACY_ARITY={MAX_LEGACY_ARITY}"),
1052 }),
1053 }
1054 }
1055 }
1056
1057 /// Print the emitted LLVM IR. Useful for tests / benchmarks that
1058 /// want to assert against the lowering output without leaving
1059 /// the test binary.
1060 pub fn emit_ir_dump(&self) -> &str {
1061 &self.jit.ir_dump
1062 }
1063
1064 /// Phase D.1: does this evaluator have a JIT-resident fast entry
1065 /// the host can dispatch through when args are all-Int? Exposed
1066 /// for the smoke tests that assert the fast path is wired up;
1067 /// benches use it to log which row hit the fast vs buffer path.
1068 pub fn has_fast_path(&self) -> bool {
1069 self.jit.fast_entry_ptr.is_some()
1070 }
1071
1072 /// Phase D.1: arity of the typed fast entry, when one was emitted.
1073 /// Matches `arity()` for source-driven entries that qualify; `None`
1074 /// when the source falls back to the buffer-only path.
1075 pub fn fast_path_arity(&self) -> Option<usize> {
1076 self.jit.fast_entry_ptr.map(|_| self.fast_path_arity)
1077 }
1078
1079 /// Phase L codegen-quality debug helper: raw address of the typed
1080 /// fast-entry function in the JIT-allocated code arena. Returns
1081 /// `None` if the source falls back to the buffer entry. Hosts use
1082 /// this to disassemble the MCJIT-produced machine code at runtime
1083 /// (`xxd` / `objdump --disassemble-all` on a byte slice) — useful
1084 /// for confirming whether the engine emitted direct `callq <pcrel>`
1085 /// vs the Large-CodeModel `movabsq + callq *%reg` shape.
1086 pub fn fast_entry_runtime_addr(&self) -> Option<usize> {
1087 self.jit.fast_entry_ptr
1088 }
1089
1090 /// Phase L codegen-quality debug helper: raw address of the
1091 /// buffer-protocol entry function in the JIT-allocated code arena.
1092 /// Always populated for a successful `from_source` build.
1093 pub fn entry_runtime_addr(&self) -> usize {
1094 self.jit.entry_ptr
1095 }
1096
1097 /// The running host's LLVM CPU name (e.g. `broadwell`, `znver3`),
1098 /// as queried by `TargetMachine::get_host_cpu_name`. This is the
1099 /// exact value stamped as the `"target-cpu"` function attribute on
1100 /// every JIT'd function so the MCJIT backend lowers for the CPU it
1101 /// runs on (and emits the host idiv-narrowing fast path rather than
1102 /// a generic bare `idivq`). Exposed so capability tests can confirm
1103 /// the stamp is the runtime host, never a hard-coded literal.
1104 pub fn host_target_cpu() -> String {
1105 TargetMachine::get_host_cpu_name()
1106 .to_str()
1107 .unwrap_or("")
1108 .to_string()
1109 }
1110
    /// Phase D.1 dispatch-boundary fast path: invoke the typed fast
    /// entry directly with positional `i64` args. Bypasses the
    /// `HashMap` pack, `BufferBuilder` writes, arena setup, and
    /// `BufferReader` decode entirely — the call chain is
    /// `Rust caller → cached fn pointer → JIT body → i64 return`.
    ///
    /// `args` must contain exactly as many values as the fast entry's
    /// arity (see [`Self::fast_path_arity`]); the values are passed
    /// through positionally and the entry's `i64` result is returned
    /// unchanged.
    ///
    /// # Errors
    ///
    /// Returns `RuntimeError::Unsupported` when
    /// - the evaluator was built without a fast entry (source past the
    ///   Int-only envelope, or constructed via `from_ir_direct`),
    /// - `args.len()` differs from the fast entry's arity, or
    /// - the arity exceeds the 8-argument cap of the dispatch table
    ///   below.
    ///
    /// NOTE(review): this is a per-call hot path whose generated code
    /// is sensitive to small source changes (see the inline note on
    /// the `expect` regression) — re-measure before restructuring.
    pub fn run_main_legacy_i64_fast(&self, args: &[i64]) -> Result<i64, RuntimeError> {
        // No fast-entry pointer means the fast path was never emitted
        // or resolved; report that instead of dispatching.
        let ptr = self
            .jit
            .fast_entry_ptr
            .ok_or_else(|| RuntimeError::Unsupported {
                reason:
                    "llvm-aot: fast entry not available; source not Int-only or fast-emit failed"
                        .into(),
            })?;
        // Pairing invariant (single assignment site in
        // `from_ir_inner_world`): `fast_path_arity` is always set
        // together with `fast_entry_ptr`, so a live pointer means the
        // bare-`usize` arity is the profile's real value — no `Option`
        // unwrap, no panic landing pad on the per-call path. (An
        // `expect` here once de-inlined this function under fat LTO
        // and regressed the W12 kernel from 3.55ns to 9.46ns/call.)
        let arity = self.fast_path_arity;
        // Reject a wrong arg count up front; the dispatch table below
        // indexes `args[0..arity]` and relies on this check.
        if args.len() != arity {
            return Err(RuntimeError::Unsupported {
                reason: format!(
                    "llvm-aot fast path: #main expects {arity} arg(s), got {}",
                    args.len()
                ),
            });
        }
        // SAFETY: the cached pointer came back from
        // `ExecutionEngine::get_function_address(ENTRY_SYMBOL_FAST)`
        // which guarantees the symbol is live for the engine's
        // lifetime. The arity-specialised dispatch table mirrors the
        // typed signature `emit_fast_entry` produced for this
        // function shape.
        unsafe {
            // One arm per supported arity: reinterpret the address as
            // the matching `FastEntryFnN` and call it with the first
            // `N` args.
            let r = match arity {
                0 => {
                    let f: FastEntryFn0 = std::mem::transmute(ptr);
                    f()
                }
                1 => {
                    let f: FastEntryFn1 = std::mem::transmute(ptr);
                    f(args[0])
                }
                2 => {
                    let f: FastEntryFn2 = std::mem::transmute(ptr);
                    f(args[0], args[1])
                }
                3 => {
                    let f: FastEntryFn3 = std::mem::transmute(ptr);
                    f(args[0], args[1], args[2])
                }
                4 => {
                    let f: FastEntryFn4 = std::mem::transmute(ptr);
                    f(args[0], args[1], args[2], args[3])
                }
                5 => {
                    let f: FastEntryFn5 = std::mem::transmute(ptr);
                    f(args[0], args[1], args[2], args[3], args[4])
                }
                6 => {
                    let f: FastEntryFn6 = std::mem::transmute(ptr);
                    f(args[0], args[1], args[2], args[3], args[4], args[5])
                }
                7 => {
                    let f: FastEntryFn7 = std::mem::transmute(ptr);
                    f(
                        args[0], args[1], args[2], args[3], args[4], args[5], args[6],
                    )
                }
                8 => {
                    let f: FastEntryFn8 = std::mem::transmute(ptr);
                    f(
                        args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
                    )
                }
                // Arities past the table have no matching function
                // pointer type; refuse rather than call with a wrong
                // signature.
                n => {
                    return Err(RuntimeError::Unsupported {
                        reason: format!("llvm-aot fast path: arity {n} > 8 dispatch cap"),
                    });
                }
            };
            Ok(r)
        }
    }
1202
1203 /// Try the fast path first: when the schema qualifies and every
1204 /// supplied arg is `Int`, dispatch through the typed JIT entry
1205 /// and wrap the i64 result. Returns `Ok(None)` when the fast
1206 /// path isn't applicable for this call (caller falls back to the
1207 /// buffer entry). `Ok(Some(v))` on a successful fast dispatch;
1208 /// `Err` only when the dispatch itself failed.
1209 fn try_run_main_fast(
1210 &self,
1211 args: &HashMap<String, Value>,
1212 ) -> Result<Option<Value>, RuntimeError> {
1213 if self.jit.fast_entry_ptr.is_none() {
1214 return Ok(None);
1215 }
1216 if !self.fast_path_auto_dispatch {
1217 return Ok(None);
1218 }
1219 // Pairing invariant: `fast_path_arity` is assigned together
1220 // with `fast_entry_ptr` at the single resolution site, so
1221 // reaching here (entry ptr is Some) means the bare-`usize`
1222 // arity is live — no `Option` unwrap on the dispatch path.
1223 let arity = self.fast_path_arity;
1224 if arity != self.param_names.len() {
1225 // Schema arity mismatch — shouldn't happen if
1226 // `build_fast_path_profile` agreed, but be defensive.
1227 return Ok(None);
1228 }
1229 let mut argv = [0i64; 8];
1230 for (i, name) in self.param_names.iter().enumerate() {
1231 match args.get(name) {
1232 Some(Value::Int(v)) => argv[i] = *v,
1233 _ => return Ok(None), // missing or non-Int arg → fall back
1234 }
1235 }
1236 let r = self.run_main_legacy_i64_fast(&argv[..arity])?;
1237 // Phase D.2: re-wrap the i64 result to match the buffer
1238 // path's `Value` shape. The fast-path profile gate accepts
1239 // both the canonical `Ret { value: Int }` wrapper (Phase
1240 // D.1 — surfaces as bare `Value::Int`) and any user-declared
1241 // anon-record return collapsed to a single Int field (Phase
1242 // D.2 — surfaces as `Value::Dict { <field_name>: Int }` to
1243 // match `run_main_buffer`'s `read_record_into_map` decode).
1244 // `is_single_value_wrapper` discriminates the two — strict
1245 // canonical name match → bare scalar; otherwise → branded
1246 // dict.
1247 if let Some(schema) = self.buffer_schema.as_ref() {
1248 if is_single_value_wrapper(&schema.return_schema) {
1249 Ok(Some(Value::Int(r)))
1250 } else {
1251 let field_name = schema.return_schema.fields[0].name.clone();
1252 let mut map: HashMap<String, Value> = HashMap::with_capacity(1);
1253 map.insert(field_name, Value::Int(r));
1254 Ok(Some(Value::branded_dict(
1255 map,
1256 Some(schema.return_schema.name.clone()),
1257 )))
1258 }
1259 } else {
1260 Ok(Some(Value::Int(r)))
1261 }
1262 }
1263
1264 /// Buffer-protocol `run_main`: pack the HashMap-keyed args into
1265 /// an arena, invoke the JIT, decode the return record.
1266 fn run_main_buffer(&self, args: HashMap<String, Value>) -> Result<Value, RuntimeError> {
1267 let schema = self
1268 .buffer_schema
1269 .as_ref()
1270 .ok_or_else(|| RuntimeError::Unsupported {
1271 reason: "llvm-aot: run_main_buffer called without schema metadata".into(),
1272 })?;
1273
1274 // 1. Pack the args into a buffer using `BufferBuilder`.
1275 let mut builder = relon_eval_api::buffer::BufferBuilder::new(
1276 &schema.main_layout,
1277 &schema.main_schema.fields,
1278 );
1279 for field in &schema.main_schema.fields {
1280 let value = args
1281 .get(&field.name)
1282 .ok_or_else(|| RuntimeError::Unsupported {
1283 reason: format!("llvm-aot: missing #main arg `{}`", field.name),
1284 })?;
1285 write_value_into_builder(&mut builder, field, value)?;
1286 }
1287 // F1: bake `in_ptr` into every input pointer slot (arena-absolute
1288 // convention), so the JIT body's param reads drop their `+ in_ptr`
1289 // rebase. `in_ptr` depends only on const-data length.
1290 let in_ptr_pre = relon_util::align_up(
1291 u32::try_from(self.const_data.len()).map_err(|_| {
1292 RuntimeError::IoError("llvm const-data section exceeds u32 range".into())
1293 })?,
1294 8,
1295 );
1296 let in_bytes = builder
1297 .finish_arena_absolute(in_ptr_pre)
1298 .map_err(buffer_to_runtime_error)?;
1299
1300 // 2. Lay out the arena. Phase E.1 widens the layout to match
1301 // the cranelift backend: `[const_data | pad | in_buf | pad |
1302 // out_buf (root + tail cap) | pad | scratch]`. The const-data
1303 // pool lives at offset 0; ConstString-emitted offsets point
1304 // directly at the records inside it. The scratch region at
1305 // the tail backs the bump allocator (`AllocScratchDyn`).
1306 let in_len = in_bytes.len() as u32;
1307 let out_root_size = schema.return_layout.root_size as u32;
1308 // For String / List return types we reserve a chunky tail-
1309 // cursor region so pointer-indirect StoreField can stamp the
1310 // payload past the fixed-area slot without re-allocating on
1311 // every dispatch.
1312 let needs_pointer_indirect_return = return_needs_tail_region(&schema.return_schema);
1313 // Cap the output region:
1314 // * fixed area: max(root_size, 8) padded to 8.
1315 // * tail area: 64 KiB cushion for String returns (W3 hits
1316 // ~3 KiB per dispatch at STRING_CONCAT_N = 3 000; a 64 KiB
1317 // reservation keeps the bump path away from arena edges
1318 // without ballooning the allocation).
1319 let tail_cap: u32 = if needs_pointer_indirect_return {
1320 65_536
1321 } else {
1322 0
1323 };
1324 let out_cap = relon_util::align_up(out_root_size.max(8) + tail_cap + 16, 8);
1325 let const_data_len = u32::try_from(self.const_data.len()).map_err(|_| {
1326 RuntimeError::IoError("llvm const-data section exceeds u32 range".into())
1327 })?;
1328 let in_ptr = relon_util::align_up(const_data_len, 8);
1329 let out_ptr = relon_util::align_up(in_ptr + in_len, 8);
1330 let scratch_base = relon_util::align_up(out_ptr + out_cap, 8);
1331 // Scratch region size: 64 KiB matches the cranelift backend's
1332 // figure; the W3 hot-loop concat writes ~3*N bytes total but
1333 // the scratch cursor never resets within a dispatch (each
1334 // iteration's intermediate string sticks around until
1335 // run-end) so we need enough headroom for the worst-case
1336 // W3-style `O(N^2)` allocation pattern.
1337 let scratch_size: u32 = 1_048_576; // 1 MiB
1338 let arena_size = (scratch_base + scratch_size) as usize;
1339
1340 // 3. Acquire the per-thread arena buffer, install the
1341 // input bytes, dispatch. Reentrant calls (a stdlib helper
1342 // looping back through the evaluator on the same thread)
1343 // fall back to a fresh `Vec<u8>` — correctness wins over
1344 // pool reuse on the vanishingly rare path.
1345 LLVM_ARENA_POOL.with(|cell| match cell.try_borrow_mut() {
1346 Ok(mut buf) => self.dispatch_with_arena(
1347 schema,
1348 &mut buf,
1349 arena_size,
1350 in_ptr,
1351 in_len,
1352 out_ptr,
1353 out_cap,
1354 scratch_base,
1355 &in_bytes,
1356 ),
1357 Err(_) => {
1358 let mut fallback: Vec<u8> = Vec::new();
1359 self.dispatch_with_arena(
1360 schema,
1361 &mut fallback,
1362 arena_size,
1363 in_ptr,
1364 in_len,
1365 out_ptr,
1366 out_cap,
1367 scratch_base,
1368 &in_bytes,
1369 )
1370 }
1371 })
1372 }
1373
    /// Inner driver shared by the pooled-arena and fallback-arena
    /// branches of [`Self::run_main_buffer`]. Grows `arena` to at
    /// least `arena_size`, installs the const-data pool and `in_bytes`,
    /// invokes the JIT entry, then decodes the output region.
    ///
    /// All offsets are arena-relative byte offsets:
    /// - `in_ptr` / `in_len`: start and length of the input region
    ///   that `in_bytes` is copied into.
    /// - `out_ptr` / `out_cap`: start and capacity of the output
    ///   region.
    /// - `scratch_base`: start of the scratch region handed to
    ///   `ArenaState`.
    ///
    /// `arena` is only ever grown, never shrunk, so a reused buffer may
    /// be longer than `arena_size`; the JIT is given just the first
    /// `arena_size` bytes.
    ///
    /// # Errors
    ///
    /// - `RuntimeError::Unsupported` if the JIT call unwinds.
    /// - The error mapped from a non-zero trap code recorded in the
    ///   per-call state.
    /// - Any error produced by [`Self::decode_buffer_return`].
    #[allow(clippy::too_many_arguments)]
    fn dispatch_with_arena(
        &self,
        schema: &BufferSchema,
        arena: &mut Vec<u8>,
        arena_size: usize,
        in_ptr: u32,
        in_len: u32,
        out_ptr: u32,
        out_cap: u32,
        scratch_base: u32,
        in_bytes: &[u8],
    ) -> Result<Value, RuntimeError> {
        // Grow-only: a pooled buffer left over from a larger dispatch
        // keeps its length.
        if arena.len() < arena_size {
            arena.resize(arena_size, 0);
        }
        // Zero only the region the JIT can observe before writing —
        // const_data is overwritten in full, in_bytes are copied on
        // top of the input area, the out region must read as zero
        // because pointer-indirect StoreField bumps into a
        // freshly-zero tail cursor, and the scratch tail is written
        // before being read by the JIT itself.
        let observable_end = (out_ptr + out_cap) as usize;
        debug_assert!(observable_end <= arena_size);
        debug_assert!(self.const_data.len() <= in_ptr as usize);
        // Clears everything from the end of const-data through the end
        // of the output region (padding, input area, output area).
        arena[self.const_data.len()..observable_end].fill(0);
        if !self.const_data.is_empty() {
            arena[..self.const_data.len()].copy_from_slice(&self.const_data);
        }
        arena[in_ptr as usize..in_ptr as usize + in_bytes.len()].copy_from_slice(in_bytes);

        // Expose exactly `arena_size` bytes to the per-call state even
        // when the backing buffer is longer.
        let live_arena = &mut arena[..arena_size];
        let state = ArenaState::new(live_arena, scratch_base);
        state.set_step_budget(self.step_budget.load(Ordering::Relaxed));
        // Phase 0b: point the per-call state at the host-fn registry so
        // a source-lowered `Op::CallNative` resolves through
        // `relon_llvm_call_native`. The registry lives on the evaluator
        // behind an `Arc` and outlives this dispatch.
        // SAFETY: `self.host_fns` is kept alive for the whole call (and
        // the evaluator's lifetime); the per-call state is the sole
        // owner of the `UnsafeCell` for the dispatch's duration.
        unsafe {
            state.install_host_fns(Arc::as_ptr(&self.host_fns));
        }
        // `state` lives on this stack frame until the function returns,
        // so the raw pointer stays valid across the JIT call below.
        let state_ptr: *const ArenaState = &state;

        // SAFETY: same pattern as the cranelift backend's
        // `invoke_buffer_entry`. The JIT entry was emitted with the
        // canonical buffer-protocol signature; the cached fn pointer
        // is alive for the engine's lifetime. The arena slice
        // `live_arena` outlives the JIT call.
        //
        // NOTE(review): the `u32` offsets are narrowed with `as i32`;
        // values above `i32::MAX` would wrap negative — presumably the
        // arena is always far below 2 GiB, confirm against the emitter.
        let bytes_written = {
            let f: BufferEntryFn = unsafe { std::mem::transmute(self.jit.entry_ptr) };
            // A panic unwinding out of the call is converted into an
            // `Unsupported` error instead of propagating.
            std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| unsafe {
                f(
                    state_ptr,
                    in_ptr as i32,
                    in_len as i32,
                    out_ptr as i32,
                    out_cap as i32,
                    /*caps=*/ self.caps_mask,
                )
            }))
            .map_err(|_| RuntimeError::Unsupported {
                reason: "llvm-aot: JIT entry panicked (no trap-code recovery in Phase B)".into(),
            })?
        };

        // Phase 0b: a `CheckCap` deny or a failed `CallNative` dispatch
        // returns the negative sentinel and records the precise cause in
        // `state.trap_code`. Lift it to a typed `RuntimeError` (the same
        // outcome class the cranelift backend surfaces) before the
        // generic negative-bytes_written path.
        let trap_code = state.trap_code();
        if trap_code != 0 {
            return Err(crate::state::NativeTrap::runtime_error_from_code(trap_code));
        }
        // Decode the buffer return out of the arena. The decode is
        // backend-shared and arena-source-agnostic (host JIT arena here;
        // wasm linear memory in the wasm-evaluator path) — see
        // [`Self::decode_buffer_return`].
        self.decode_buffer_return(
            schema,
            arena,
            ArenaRegions {
                const_data_len: self.const_data.len(),
                in_ptr,
                in_len,
                out_ptr,
                out_cap,
                scratch_base,
                arena_size,
            },
            bytes_written,
        )
    }
1474
1475 /// Decode a buffer-protocol return out of an arena, given the raw
1476 /// i32 the entry returned (`bytes_written` / sentinel) and the arena
1477 /// region boundaries.
1478 ///
1479 /// This is the **single** post-call decode the native JIT path and
1480 /// the wasm-evaluator path share. It is deliberately source-agnostic:
1481 /// `arena` is just `&[u8]` (the host JIT arena, or a slice of wasm
1482 /// linear memory rebased to the arena origin), and every region
1483 /// offset in `regions` is arena-relative, so the wasm host can hand
1484 /// the same view and offsets the JIT path computes.
1485 ///
1486 /// Two paths, identical to the historical inline decode:
1487 /// - **negative** `ret`: the in-place region-walk sentinel
1488 /// `-(root_abs + 1)`. We recover `root_abs`, then defer entirely to
1489 /// the backend-shared `relon_eval_api::inplace_return` pipeline
1490 /// (region-select → **verifier** → in-place decode). The verifier
1491 /// is non-negotiable: an unverified buffer is never decoded, on the
1492 /// wasm linear-memory path exactly as on the host path.
1493 /// - **non-negative** `ret`: the fixed-area / tail-cursor return; the
1494 /// `BufferReader` walks `out_buf`.
1495 fn decode_buffer_return(
1496 &self,
1497 schema: &BufferSchema,
1498 arena: &[u8],
1499 regions: ArenaRegions,
1500 ret: i32,
1501 ) -> Result<Value, RuntimeError> {
1502 // In-place region-walk return ABI (S2): a negative return value
1503 // is the in-place sentinel `-(root_abs + 1)`. Instead of a value
1504 // copied into `out_buf`, the machine code reports the
1505 // arena-relative offset of the return root — a `List<List<scalar>>`,
1506 // `List<String>`, or `List<Schema>` value sourced from a `#main`
1507 // parameter identity.
1508 // We rebase it to its source region, run the bounds verifier over
1509 // the whole reachable graph confined to that region, and only on
1510 // a clean verify decode the value in place. A verifier failure is
1511 // a loud error — we never decode an unverified in-place return.
1512 // The decode pipeline (sentinel → region-select → verifier →
1513 // decode) is shared with the cranelift backend via
1514 // `relon_eval_api::inplace_return`, and reused verbatim by the
1515 // wasm host (the arena is then a slice of wasm linear memory).
1516 if ret < 0 {
1517 let root_abs = relon_eval_api::inplace_return::decode_inplace_sentinel(ret)?;
1518 if !is_single_value_wrapper(&schema.return_schema) {
1519 return Err(RuntimeError::IoError(
1520 "llvm-aot in-place return on a non-single-value return schema".into(),
1521 ));
1522 }
1523 return relon_eval_api::inplace_return::decode_inplace_return(
1524 "llvm-aot",
1525 arena,
1526 regions,
1527 root_abs,
1528 &schema.return_schema.fields[0],
1529 &schema.return_layout,
1530 &schema.return_schema.fields,
1531 );
1532 }
1533 let bw = ret as usize;
1534
1535 let read_len = bw.max(schema.return_layout.root_size);
1536 let out_ptr = regions.out_ptr as usize;
1537 let read_end = out_ptr + read_len;
1538 if read_end > regions.arena_size || read_end > arena.len() {
1539 return Err(RuntimeError::IoError(
1540 "llvm-aot arena too small for return decode".into(),
1541 ));
1542 }
1543 let arena = &arena[..regions.arena_size.min(arena.len())];
1544 // Object / fixed-area return path: the shared central entry gates
1545 // the record through the multi-region bounds verifier BEFORE any
1546 // decode (verify → decode is enforced inside, so no object-return
1547 // caller can skip it), then walks the backend-shared object-field
1548 // reader. Under the F1 arena-absolute slot convention the object
1549 // head sits at `out_ptr` and every pointer slot it carries is an
1550 // arena-absolute offset, so the reader + verifier walk the **whole
1551 // arena** anchored at `out_ptr`. The gate confines every followed
1552 // span to one region (today all in `out`; cross-region object
1553 // fields stay capped — F1b releases them) and closes the red-line
1554 // gap where the object path previously decoded with no verifier.
1555 relon_eval_api::inplace_return::decode_object_return(
1556 "llvm-aot",
1557 arena,
1558 out_ptr,
1559 regions,
1560 &schema.return_layout,
1561 &schema.return_schema,
1562 is_single_value_wrapper(&schema.return_schema),
1563 )
1564 }
1565
1566 /// Plan a wasm buffer-protocol dispatch: pack the `#main` args into
1567 /// the input record and compute the same arena layout
1568 /// `run_main_buffer` lays for the host JIT.
1569 ///
1570 /// The wasm host (wasmtime) lays the returned [`WasmBufferDispatch`]
1571 /// into linear memory, invokes the exported buffer entry, then hands
1572 /// the post-call arena view back to [`Self::wasm_buffer_decode`]. The
1573 /// arena layout, the const-data prefix, and the input packing are
1574 /// **byte-identical** to the host path, so the wasm module — which is
1575 /// the same LLVM IR retargeted to wasm32 — observes exactly the arena
1576 /// the JIT body was emitted against. The single divergence is the
1577 /// arena's absolute base in memory (a host `Vec` vs. a wasm
1578 /// linear-memory offset), which the wasm body absorbs through its
1579 /// `arena_base` global; every offset here is arena-relative.
1580 pub fn wasm_buffer_plan(
1581 &self,
1582 args: &HashMap<String, Value>,
1583 ) -> Result<WasmBufferDispatch, RuntimeError> {
1584 let schema = self
1585 .buffer_schema
1586 .as_ref()
1587 .ok_or_else(|| RuntimeError::Unsupported {
1588 reason: "llvm-aot: wasm_buffer_plan called without schema metadata".into(),
1589 })?;
1590
1591 // Pack the input record exactly as `run_main_buffer` does.
1592 let mut builder = relon_eval_api::buffer::BufferBuilder::new(
1593 &schema.main_layout,
1594 &schema.main_schema.fields,
1595 );
1596 for field in &schema.main_schema.fields {
1597 let value = args
1598 .get(&field.name)
1599 .ok_or_else(|| RuntimeError::Unsupported {
1600 reason: format!("llvm-aot: missing #main arg `{}`", field.name),
1601 })?;
1602 write_value_into_builder(&mut builder, field, value)?;
1603 }
1604 // F1: bake `in_ptr` into every input pointer slot (arena-absolute
1605 // convention) — identical to `run_main_buffer`, so the wasm module
1606 // (same IR retargeted) sees the same input bytes.
1607 let in_ptr_pre = relon_util::align_up(
1608 u32::try_from(self.const_data.len()).map_err(|_| {
1609 RuntimeError::IoError("llvm const-data section exceeds u32 range".into())
1610 })?,
1611 8,
1612 );
1613 let in_bytes = builder
1614 .finish_arena_absolute(in_ptr_pre)
1615 .map_err(buffer_to_runtime_error)?;
1616
1617 // Lay out the arena identically to `run_main_buffer`.
1618 let in_len = in_bytes.len() as u32;
1619 let out_root_size = schema.return_layout.root_size as u32;
1620 let needs_pointer_indirect_return = return_needs_tail_region(&schema.return_schema);
1621 let tail_cap: u32 = if needs_pointer_indirect_return {
1622 65_536
1623 } else {
1624 0
1625 };
1626 let out_cap = relon_util::align_up(out_root_size.max(8) + tail_cap + 16, 8);
1627 let const_data_len = u32::try_from(self.const_data.len()).map_err(|_| {
1628 RuntimeError::IoError("llvm const-data section exceeds u32 range".into())
1629 })?;
1630 let in_ptr = relon_util::align_up(const_data_len, 8);
1631 let out_ptr = relon_util::align_up(in_ptr + in_len, 8);
1632 let scratch_base = relon_util::align_up(out_ptr + out_cap, 8);
1633 let scratch_size: u32 = 1_048_576;
1634 let arena_size = (scratch_base + scratch_size) as usize;
1635
1636 Ok(WasmBufferDispatch {
1637 const_data: self.const_data.clone(),
1638 in_bytes,
1639 regions: ArenaRegions {
1640 const_data_len: self.const_data.len(),
1641 in_ptr,
1642 in_len,
1643 out_ptr,
1644 out_cap,
1645 scratch_base,
1646 arena_size,
1647 },
1648 })
1649 }
1650
1651 /// Decode a wasm buffer-protocol return. `arena` is a slice of the
1652 /// wasm linear memory **rebased to the arena origin** (i.e.
1653 /// `&memory[arena_abs .. arena_abs + arena_size]`), so the
1654 /// arena-relative offsets in `regions` and the arena-relative root in
1655 /// the negative sentinel resolve exactly as they do on the host JIT
1656 /// path. `ret` is the i32 the wasm entry returned.
1657 ///
1658 /// This routes through the **same** [`Self::decode_buffer_return`] the
1659 /// host path uses — the in-place sentinel still runs the
1660 /// `relon_eval_api::inplace_return` verifier over the linear-memory
1661 /// slice before any decode. There is no wasm-specific decode or
1662 /// wasm-specific verifier.
1663 pub fn wasm_buffer_decode(
1664 &self,
1665 arena: &[u8],
1666 regions: ArenaRegions,
1667 ret: i32,
1668 ) -> Result<Value, RuntimeError> {
1669 let schema = self
1670 .buffer_schema
1671 .as_ref()
1672 .ok_or_else(|| RuntimeError::Unsupported {
1673 reason: "llvm-aot: wasm_buffer_decode called without schema metadata".into(),
1674 })?;
1675 self.decode_buffer_return(schema, arena, regions, ret)
1676 }
1677}
1678
/// A planned wasm buffer-protocol dispatch produced by
/// [`LlvmAotEvaluator::wasm_buffer_plan`]: the const-data prefix, the
/// packed input record, and the full arena region layout.
///
/// The wasm host lays `const_data` at arena offset 0 and `in_bytes` at
/// `regions.in_ptr`, invokes the entry symbol it emitted, then decodes
/// via [`LlvmAotEvaluator::wasm_buffer_decode`].
#[derive(Debug, Clone)]
pub struct WasmBufferDispatch {
    /// Const-pool blob (a copy of the evaluator's const data); laid at
    /// arena offset 0, before `regions.in_ptr`.
    pub const_data: Vec<u8>,
    /// Packed `#main` input record; laid at `regions.in_ptr`. It is
    /// finished with `in_ptr` baked in (arena-absolute convention).
    pub in_bytes: Vec<u8>,
    /// Arena region boundaries (all arena-relative).
    pub regions: ArenaRegions,
}
1694
1695impl Evaluator for LlvmAotEvaluator {
1696 fn eval(&self, _node: &Node, _scope: &Arc<Scope>) -> Result<Value, RuntimeError> {
1697 Err(RuntimeError::Unsupported {
1698 reason: "llvm-aot: `eval` is not supported".into(),
1699 })
1700 }
1701
1702 fn eval_root(&self, _scope: &Arc<Scope>) -> Result<Value, RuntimeError> {
1703 Err(RuntimeError::Unsupported {
1704 reason: "llvm-aot: `eval_root` is not supported".into(),
1705 })
1706 }
1707
1708 fn run_main(&self, args: HashMap<String, Value>) -> Result<Value, RuntimeError> {
1709 // Phase D.1 dispatch-boundary fast path: try the typed entry
1710 // first. Falls through to the buffer-protocol path on
1711 // mismatch (non-Int args, schema past the Int-only envelope,
1712 // no fast entry emitted) — transparent to the host.
1713 if let Some(v) = self.try_run_main_fast(&args)? {
1714 return Ok(v);
1715 }
1716 match self.entry_shape {
1717 EntryShape::Buffer => self.run_main_buffer(args),
1718 EntryShape::LegacyI64 => {
1719 // Pack the HashMap into a positional i64 argv using
1720 // the declared parameter order.
1721 let mut argv = [0i64; MAX_LEGACY_ARITY];
1722 for (i, name) in self.param_names.iter().enumerate() {
1723 let v = args.get(name).ok_or_else(|| RuntimeError::Unsupported {
1724 reason: format!("llvm-aot: missing #main arg `{name}`"),
1725 })?;
1726 match v {
1727 Value::Int(n) => argv[i] = *n,
1728 other => {
1729 return Err(RuntimeError::Unsupported {
1730 reason: format!(
1731 "llvm-aot: legacy-i64 #main arg `{name}` is {} (Int only)",
1732 other.type_name()
1733 ),
1734 });
1735 }
1736 }
1737 }
1738 let r = self.run_main_legacy_i64(&argv[..self.entry_arity])?;
1739 Ok(Value::Int(r))
1740 }
1741 }
1742 }
1743
1744 fn force_thunk(&self, _thunk: &Arc<Thunk>) -> Result<Value, RuntimeError> {
1745 Err(RuntimeError::Unsupported {
1746 reason: "llvm-aot: `force_thunk` is not supported".into(),
1747 })
1748 }
1749
1750 fn invoke_closure(
1751 &self,
1752 _closure: &ClosureData,
1753 _args: &[Value],
1754 ) -> Result<Value, RuntimeError> {
1755 Err(RuntimeError::Unsupported {
1756 reason: "llvm-aot: `invoke_closure` is not supported".into(),
1757 })
1758 }
1759}
1760
1761// ---------------------------------------------------------------------------
1762// Buffer-protocol packing / unpacking helpers.
1763//
1764// These mirror what `relon-codegen-cranelift::evaluator` does for
1765// `write_value_into_builder` / `is_single_value_wrapper` /
1766// `buffer_to_runtime_error`. The object-return *decode* side is no
1767// longer mirrored per crate — it lives once in
1768// `relon_eval_api::inplace_return::decode_object_return`. Kept inside
1769// this crate so the LLVM backend has no compile-time dep on
1770// cranelift-native.
1771// ---------------------------------------------------------------------------
1772
1773fn buffer_to_runtime_error(e: relon_eval_api::buffer::BufferError) -> RuntimeError {
1774 RuntimeError::IoError(format!("llvm-aot buffer: {e}"))
1775}
1776
1777fn is_single_value_wrapper(schema: &relon_eval_api::schema_canonical::Schema) -> bool {
1778 schema.name == relon_ir::MAIN_RETURN_SCHEMA_NAME
1779 && schema.fields.len() == 1
1780 && schema.fields[0].name == relon_ir::RETURN_VALUE_FIELD_NAME
1781}
1782
1783/// Phase D.2: looser sibling of [`is_single_value_wrapper`] used to
1784/// gate the typed-i64 fast-path. Accepts any single-field record whose
1785/// sole field is `Int` — the canonical `Ret { value: Int }` wrapper
1786/// **and** any user-declared `#main(...) -> Dict` whose anon-record
1787/// lowering collapsed to one `Int` field (W7's `{ result: Int }` is
1788/// the motivating case).
1789///
1790/// The strict [`is_single_value_wrapper`] check stays in place for the
1791/// `run_main` buffer decoder — branded user dicts must still surface
1792/// as `Value::Dict` for the host, not be unwrapped to a bare scalar.
1793fn is_single_int_field_record(schema: &relon_eval_api::schema_canonical::Schema) -> bool {
1794 use relon_eval_api::schema_canonical::TypeRepr;
1795 // A tuple schema (`is_tuple`) decodes positionally to a `Value::Tuple`,
1796 // never to a scalar / branded dict — so a 1-tuple
1797 // `Tuple<Int>` must NOT take the typed-i64 fast path (which would
1798 // return the wrong container shape). Force it onto the buffer path so
1799 // the shared `decode_object_return` tuple fork runs.
1800 !schema.is_tuple && schema.fields.len() == 1 && matches!(schema.fields[0].ty, TypeRepr::Int)
1801}
1802
1803/// Marshal a typed [`Value`] into the buffer slot for `field` on the
1804/// way *into* the JIT body (host → arena).
1805///
1806/// ## marshalling-seam contract (host side)
1807///
1808/// This dispatcher is one of the per-type marshalling seams S1.A
1809/// carved out so each leaf type owns a private `marshal_<type>_in`
1810/// helper rather than living inline in a single fat `match`. Adding a
1811/// new leaf type means: (1) add an arm here delegating to a new
1812/// `marshal_<type>_in`, (2) add the symmetric arm to the shared
1813/// object-return decoder `relon_eval_api::inplace_return` (reached via
1814/// `decode_object_return`), and (3) widen the build.rs-visible
1815/// [`EmittedFieldType`] triple (see that enum's docs).
1816///
1817/// Note: MCJIT already marshals `Float` / `Schema` here; the
1818/// build.rs-visible [`EmittedFieldType`] surface is the *narrower* set
1819/// (see [`lower_field_descriptors`]). Keep the two in mind separately —
1820/// this seam is the runtime marshaller, `EmittedFieldType` is the
1821/// AOT-binding signature surface.
1822fn write_value_into_builder(
1823 builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
1824 field: &relon_eval_api::schema_canonical::Field,
1825 value: &Value,
1826) -> Result<(), RuntimeError> {
1827 use relon_eval_api::schema_canonical::TypeRepr;
1828 match (&field.ty, value) {
1829 (TypeRepr::Int, Value::Int(v)) => marshal_int_in(builder, &field.name, *v),
1830 (TypeRepr::Float, Value::Float(v)) => {
1831 marshal_float_in(builder, &field.name, v.into_inner())
1832 }
1833 (TypeRepr::Float, Value::Int(v)) => marshal_float_in(builder, &field.name, *v as f64),
1834 (TypeRepr::Bool, Value::Bool(v)) => marshal_bool_in(builder, &field.name, *v),
1835 (TypeRepr::Unit, v) if v.is_option_none() => marshal_unit_in(builder, &field.name),
1836 (TypeRepr::String, Value::String(s)) => marshal_string_in(builder, &field.name, s),
1837 (TypeRepr::Schema { schema }, Value::Dict(dict)) if !schema.is_tuple => {
1838 marshal_schema_in(builder, &field.name, schema, dict)
1839 }
1840 (TypeRepr::Schema { schema }, Value::Tuple(items)) if schema.is_tuple => {
1841 marshal_tuple_in(builder, &field.name, schema, items.as_ref())
1842 }
1843 (TypeRepr::List { element }, Value::List(items)) => {
1844 marshal_list_in(builder, &field.name, element, items)
1845 }
1846 (TypeRepr::Option { .. } | TypeRepr::Result { .. } | TypeRepr::Enum { .. }, _) => builder
1847 .write_value(&field.name, &field.ty, value)
1848 .map_err(buffer_to_runtime_error),
1849 // ----- add new leaf marshalling arm above this line -----
1850 (ty, v) => Err(RuntimeError::Unsupported {
1851 reason: format!(
1852 "llvm-aot: #main arg `{}` got {} but schema expects {ty:?}",
1853 field.name,
1854 v.type_name()
1855 ),
1856 }),
1857 }
1858}
1859
1860// --- per-variant host-side input marshalling helpers (S1.A seam) ---
1861//
// One `marshal_<type>_in` per leaf type. A new leaf type adds its own
// helper here without touching sibling arms.
1864
1865fn marshal_int_in(
1866 builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
1867 name: &str,
1868 v: i64,
1869) -> Result<(), RuntimeError> {
1870 builder.write_int(name, v).map_err(buffer_to_runtime_error)
1871}
1872
1873fn marshal_float_in(
1874 builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
1875 name: &str,
1876 v: f64,
1877) -> Result<(), RuntimeError> {
1878 builder
1879 .write_float(name, v)
1880 .map_err(buffer_to_runtime_error)
1881}
1882
1883fn marshal_bool_in(
1884 builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
1885 name: &str,
1886 v: bool,
1887) -> Result<(), RuntimeError> {
1888 builder.write_bool(name, v).map_err(buffer_to_runtime_error)
1889}
1890
1891fn marshal_unit_in(
1892 builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
1893 name: &str,
1894) -> Result<(), RuntimeError> {
1895 builder.write_unit(name).map_err(buffer_to_runtime_error)
1896}
1897
1898/// Top-level / schema `String` `#main` arg marshalling. The
1899/// pointer-indirect `BufferBuilder::write_string` appends a
1900/// `[len: u32 LE][utf8]` record into the parent buffer's tail area and
1901/// back-patches the 4-byte buffer-relative offset slot the JIT's
1902/// `LoadStringPtr` reads — the same record shape `ConstString` bakes.
1903fn marshal_string_in(
1904 builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
1905 name: &str,
1906 s: &str,
1907) -> Result<(), RuntimeError> {
1908 builder
1909 .write_string(name, s)
1910 .map_err(buffer_to_runtime_error)
1911}
1912
1913/// `List<…>` `#main` arg marshalling. Dispatches on the canonical
1914/// element type to the matching pointer-indirect `write_list_*` writer,
1915/// each of which appends the tail record (`[len][payload]` for scalar
1916/// elements, a `[len][off_0]…` pointer array of `[len][utf8]` String
1917/// records for `List<String>`) into the parent buffer's tail area and
1918/// back-patches the 4-byte buffer-relative offset slot the JIT's
1919/// `LoadList*Ptr` / pointer-indirect `LoadFieldAtAbsolute` reads — the
1920/// same shapes the ConstPool `add_list_*` blobs bake, so a list `#main`
1921/// arg and a const list return share one tail-record protocol. Element
1922/// `Value`s are type-checked against the declared element type;
1923/// `List<Schema>` (and any other element) stays a loud cap.
1924fn marshal_list_in(
1925 builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
1926 name: &str,
1927 element: &relon_eval_api::schema_canonical::TypeRepr,
1928 items: &[Value],
1929) -> Result<(), RuntimeError> {
1930 use relon_eval_api::schema_canonical::TypeRepr;
1931 let mismatch = |idx: usize, got: &Value, want: &str| RuntimeError::Unsupported {
1932 reason: format!(
1933 "llvm-aot: List<{want}> arg `{name}` element #{idx} got {} but expects {want}",
1934 got.type_name()
1935 ),
1936 };
1937 match element {
1938 TypeRepr::Int => {
1939 let mut out = Vec::with_capacity(items.len());
1940 for (i, it) in items.iter().enumerate() {
1941 match it {
1942 Value::Int(v) => out.push(*v),
1943 other => return Err(mismatch(i, other, "Int")),
1944 }
1945 }
1946 builder
1947 .write_list_int(name, &out)
1948 .map_err(buffer_to_runtime_error)
1949 }
1950 TypeRepr::Float => {
1951 let mut out = Vec::with_capacity(items.len());
1952 for (i, it) in items.iter().enumerate() {
1953 match it {
1954 Value::Float(v) => out.push(v.into_inner()),
1955 Value::Int(v) => out.push(*v as f64),
1956 other => return Err(mismatch(i, other, "Float")),
1957 }
1958 }
1959 builder
1960 .write_list_float(name, &out)
1961 .map_err(buffer_to_runtime_error)
1962 }
1963 TypeRepr::Bool => {
1964 let mut out = Vec::with_capacity(items.len());
1965 for (i, it) in items.iter().enumerate() {
1966 match it {
1967 Value::Bool(v) => out.push(*v),
1968 other => return Err(mismatch(i, other, "Bool")),
1969 }
1970 }
1971 builder
1972 .write_list_bool(name, &out)
1973 .map_err(buffer_to_runtime_error)
1974 }
1975 TypeRepr::String => {
1976 let mut out: Vec<&str> = Vec::with_capacity(items.len());
1977 for (i, it) in items.iter().enumerate() {
1978 match it {
1979 Value::String(s) => out.push(s.as_str()),
1980 other => return Err(mismatch(i, other, "String")),
1981 }
1982 }
1983 builder
1984 .write_list_string(name, &out)
1985 .map_err(buffer_to_runtime_error)
1986 }
1987 TypeRepr::Schema { schema } => marshal_list_schema_in(builder, name, schema, items),
1988 TypeRepr::List { element: inner } => marshal_list_list_in(builder, name, inner, items),
1989 TypeRepr::Option { .. } | TypeRepr::Result { .. } | TypeRepr::Enum { .. } => {
1990 let ty = TypeRepr::List {
1991 element: Box::new(element.clone()),
1992 };
1993 builder
1994 .write_value(name, &ty, &Value::List(Arc::new(items.to_vec())))
1995 .map_err(buffer_to_runtime_error)
1996 }
1997 other => Err(RuntimeError::Unsupported {
1998 reason: format!(
1999 "llvm-aot: List element type {other:?} for arg `{name}` is not yet materialised \
2000 (List<Int/Float/Bool/String/Schema> + List<List<scalar>>)"
2001 ),
2002 }),
2003 }
2004}
2005
2006/// Marshal a `List<Schema>` arg: each element is a branded
2007/// `Value::Dict` written as a sub-record into the parent buffer's tail
2008/// through [`relon_eval_api::buffer::ListRecordWriter`]. The list
2009/// header's per-entry offsets and the inner sub-records' own pointer
2010/// slots are relocated into the parent's coordinate system by
2011/// `finish_entry` / `finish_list_record`. Mirrors the cranelift backend.
2012fn marshal_list_schema_in(
2013 builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
2014 name: &str,
2015 schema: &relon_eval_api::schema_canonical::Schema,
2016 items: &[Value],
2017) -> Result<(), RuntimeError> {
2018 let elem_layout = relon_eval_api::layout::SchemaLayout::offsets_for(schema).map_err(|e| {
2019 RuntimeError::Unsupported {
2020 reason: format!("llvm-aot: List<Schema> arg `{name}` element layout: {e}"),
2021 }
2022 })?;
2023 let mut writer = builder
2024 .list_record_writer(name, &elem_layout, schema)
2025 .map_err(buffer_to_runtime_error)?;
2026 for (i, it) in items.iter().enumerate() {
2027 let mut child = writer.start_entry();
2028 match it {
2029 Value::Dict(dict) if !schema.is_tuple => {
2030 write_schema_into_builder(&mut child, schema, dict, name)?;
2031 }
2032 Value::Tuple(tuple_items) if schema.is_tuple => {
2033 write_tuple_into_builder(&mut child, schema, tuple_items.as_ref(), name)?;
2034 }
2035 other => {
2036 return Err(RuntimeError::Unsupported {
2037 reason: format!(
2038 "llvm-aot: List<Schema> arg `{name}` element #{i} got {} but expects {}",
2039 other.type_name(),
2040 if schema.is_tuple {
2041 "a tuple"
2042 } else {
2043 "a branded record"
2044 }
2045 ),
2046 });
2047 }
2048 }
2049 writer
2050 .finish_entry(builder, child)
2051 .map_err(buffer_to_runtime_error)?;
2052 }
2053 builder
2054 .finish_list_record(writer)
2055 .map_err(buffer_to_runtime_error)
2056}
2057
2058/// Marshal a nested `List<List<scalar>>` arg. Each element is itself a
2059/// `Value::List` of inline-fixed scalars (`Int` / `Float` / `Bool`)
2060/// serialised into a `[len][payload]` inner record; the outer header is
2061/// a pointer array of offsets to those records. Mirrors the cranelift
2062/// backend; inner pointer-array element lists (`List<List<String>>`)
2063/// stay a loud cap at the layout pass.
2064fn marshal_list_list_in(
2065 builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
2066 name: &str,
2067 inner: &relon_eval_api::schema_canonical::TypeRepr,
2068 items: &[Value],
2069) -> Result<(), RuntimeError> {
2070 use relon_eval_api::schema_canonical::TypeRepr;
2071 // `List<List<scalar>>` keeps the inline-fixed inner-record writer;
2072 // `List<List<String|Schema|List>>` (F5) routes through the recursive
2073 // doubly-nested pointer-array marshaller.
2074 match inner {
2075 TypeRepr::Int | TypeRepr::Float | TypeRepr::Bool => {
2076 relon_eval_api::buffer::write_nested_scalar_list(builder, name, inner, items)
2077 .map_err(buffer_to_runtime_error)
2078 }
2079 _ => relon_eval_api::buffer::write_nested_pointer_array_list(builder, name, inner, items)
2080 .map_err(buffer_to_runtime_error),
2081 }
2082}
2083
2084/// Phase 0b: Schema-typed `#main` arg marshalling. A branded
2085/// `Value::Dict` (e.g. `#main(Outer o)`) lands here.
2086/// `BufferBuilder::sub_record` / `finish_sub_record` (eval-api
2087/// Phase 9.b-1) write the sub-record into the parent buffer's tail area
2088/// and back-patch the 4-byte buffer-relative offset slot in the fixed
2089/// area — exactly the slot `LoadSchemaPtr` reads. We recurse over the
2090/// sub-fields (including nested Inner); `finish_sub_record`'s internal
2091/// `relocate_pointers` rebases the child's own pointer slots into the
2092/// parent's coordinate system.
2093fn marshal_schema_in(
2094 builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
2095 name: &str,
2096 schema: &relon_eval_api::schema_canonical::Schema,
2097 dict: &relon_eval_api::ValueDict,
2098) -> Result<(), RuntimeError> {
2099 let sub_layout = relon_eval_api::layout::SchemaLayout::offsets_for(schema).map_err(|e| {
2100 RuntimeError::Unsupported {
2101 reason: format!("llvm-aot: schema arg `{name}` layout: {e}"),
2102 }
2103 })?;
2104 let mut child = builder
2105 .sub_record(name, &sub_layout, &schema.fields)
2106 .map_err(buffer_to_runtime_error)?;
2107 write_schema_into_builder(&mut child, schema, dict, name)?;
2108 builder
2109 .finish_sub_record(name, child)
2110 .map_err(buffer_to_runtime_error)
2111}
2112
2113/// Tuple-typed `#main` arg marshalling. A tuple is a positional record
2114/// (`schema.is_tuple`) at the binary layer, with a `Value::Tuple` host
2115/// shape at the API layer.
2116fn marshal_tuple_in(
2117 builder: &mut relon_eval_api::buffer::BufferBuilder<'_>,
2118 name: &str,
2119 schema: &relon_eval_api::schema_canonical::Schema,
2120 items: &[Value],
2121) -> Result<(), RuntimeError> {
2122 let sub_layout = relon_eval_api::layout::SchemaLayout::offsets_for(schema).map_err(|e| {
2123 RuntimeError::Unsupported {
2124 reason: format!("llvm-aot: tuple arg `{name}` layout: {e}"),
2125 }
2126 })?;
2127 let mut child = builder
2128 .sub_record(name, &sub_layout, &schema.fields)
2129 .map_err(buffer_to_runtime_error)?;
2130 write_tuple_into_builder(&mut child, schema, items, name)?;
2131 builder
2132 .finish_sub_record(name, child)
2133 .map_err(buffer_to_runtime_error)
2134}
2135
2136/// Recursively fill `child` (a detached sub-record builder) with the
2137/// fields of `schema`, pulling each value out of the branded `dict`.
2138/// Nested `Schema`-typed fields recurse through
2139/// [`write_value_into_builder`]'s Schema arm, which re-enters this
2140/// helper one layer down.
2141///
2142/// `parent_field` is only used for error messages so a missing nested
2143/// field names its enclosing slot.
2144fn write_schema_into_builder(
2145 child: &mut relon_eval_api::buffer::BufferBuilder<'_>,
2146 schema: &relon_eval_api::schema_canonical::Schema,
2147 dict: &relon_eval_api::ValueDict,
2148 parent_field: &str,
2149) -> Result<(), RuntimeError> {
2150 for sub_field in &schema.fields {
2151 let sub_value =
2152 dict.map
2153 .get(sub_field.name.as_str())
2154 .ok_or_else(|| RuntimeError::Unsupported {
2155 reason: format!(
2156 "llvm-aot: schema arg `{parent_field}` is missing field `{}`",
2157 sub_field.name
2158 ),
2159 })?;
2160 write_value_into_builder(child, sub_field, sub_value)?;
2161 }
2162 Ok(())
2163}
2164
2165/// Recursively fill `child` from a tuple value, pairing positional items
2166/// with the tuple schema's synthetic `"0"`, `"1"`, ... fields.
2167fn write_tuple_into_builder(
2168 child: &mut relon_eval_api::buffer::BufferBuilder<'_>,
2169 schema: &relon_eval_api::schema_canonical::Schema,
2170 items: &[Value],
2171 parent_field: &str,
2172) -> Result<(), RuntimeError> {
2173 if items.len() != schema.fields.len() {
2174 return Err(RuntimeError::Unsupported {
2175 reason: format!(
2176 "llvm-aot: tuple arg `{parent_field}` has arity {} but schema expects {}",
2177 items.len(),
2178 schema.fields.len()
2179 ),
2180 });
2181 }
2182 for (sub_field, sub_value) in schema.fields.iter().zip(items.iter()) {
2183 write_value_into_builder(child, sub_field, sub_value)?;
2184 }
2185 Ok(())
2186}
2187
2188// The object-return field decode (`read_value_from_reader` /
2189// `read_record_into_map` and the per-type `marshal_*_out` seam) now
2190// lives once in `relon_eval_api::inplace_return` and is reached through
2191// `decode_object_return`; both AOT backends share that single copy, so a
2192// new return field type is added in exactly one place.
2193
2194/// Phase E.1: does the return schema include any pointer-indirect
2195/// type (`String` / `List*`)? Drives the output buffer's tail-cap
2196/// sizing — fixed-area-only returns don't need the 64 KiB cushion.
2197fn return_needs_tail_region(schema: &relon_eval_api::schema_canonical::Schema) -> bool {
2198 use relon_eval_api::schema_canonical::TypeRepr;
2199 schema.fields.iter().any(|f| {
2200 matches!(
2201 f.ty,
2202 TypeRepr::String
2203 | TypeRepr::List { .. }
2204 | TypeRepr::Schema { .. }
2205 | TypeRepr::Option { .. }
2206 | TypeRepr::Result { .. }
2207 | TypeRepr::Enum { .. }
2208 )
2209 })
2210}
2211
2212/// Phase D.1 / D.2: discover whether `schema` qualifies for the typed
2213/// fast-path entry. Eligibility requires every declared `#main` arg
2214/// to be `Int` (Inline scalar at 8 / 8) and the return record to
2215/// carry a single `Int` field — either the canonical
2216/// `Ret { value: Int }` wrapper (Phase D.1) or any user-declared
2217/// `#main(...) -> Dict` whose anon-record lowering collapsed to one
2218/// `Int` field (Phase D.2 — W7's `{ result: Int }` is the motivating
2219/// shape). Returns the `FastPathProfile` mapping param-declaration
2220/// Whether the typed `(i64..) -> i64` fast entry can lower `entry`'s
2221/// body. The fast entry runs with **no `*state` pointer and an empty
2222/// const-pool** (see `emit_fast_entry`), so any op that resolves
2223/// against the arena-prefix const-pool — `Op::ConstString` and the
2224/// `Op::ConstList*` family — cannot be materialised on it. Such a body
2225/// must take the buffer entry even when its `#main` schema is otherwise
2226/// fast-eligible (W4: `Int -> Int` schema over a `"axb"` string
2227/// literal). Returns `false` if any reachable op references the pool.
2228///
2229/// This is the object-emit analogue of MCJIT's
2230/// emit-fast-then-roll-back-on-failure dance: rather than emit a fast
2231/// entry, watch it fail, and delete it, we predict the failure here and
2232/// route straight to the buffer entry (the object module has no second
2233/// "buffer entry also present" fallback to fall onto).
2234fn fast_entry_emittable(entry: &relon_ir::ir::Func) -> bool {
2235 !body_references_const_pool(&entry.body)
2236}
2237
2238fn body_may_raise_typed_trap(body: &[relon_ir::ir::TaggedOp]) -> bool {
2239 use relon_ir::ir::{IrType, Op};
2240 for tagged in body {
2241 let hit = match &tagged.op {
2242 Op::Add(IrType::I64)
2243 | Op::Sub(IrType::I64)
2244 | Op::Mul(IrType::I64)
2245 | Op::Div(IrType::I64)
2246 | Op::Mod(IrType::I64)
2247 | Op::Trap { .. }
2248 | Op::CheckCap { .. }
2249 | Op::CallNative { .. } => true,
2250 Op::Block { body, .. } | Op::Loop { body, .. } => body_may_raise_typed_trap(body),
2251 Op::If {
2252 then_body,
2253 else_body,
2254 ..
2255 } => body_may_raise_typed_trap(then_body) || body_may_raise_typed_trap(else_body),
2256 Op::Call { fn_index, .. } => {
2257 let stdlib = relon_ir::stdlib::builtin_stdlib();
2258 stdlib
2259 .get(*fn_index as usize)
2260 .map(|callee| body_may_raise_typed_trap(&callee.body_owned()))
2261 .unwrap_or(true)
2262 }
2263 _ => false,
2264 };
2265 if hit {
2266 return true;
2267 }
2268 }
2269 false
2270}
2271
2272fn body_references_const_pool(body: &[relon_ir::ir::TaggedOp]) -> bool {
2273 use relon_ir::ir::Op;
2274 for tagged in body {
2275 let hit = match &tagged.op {
2276 Op::ConstString { .. }
2277 | Op::ConstListInt { .. }
2278 | Op::ConstListFloat { .. }
2279 | Op::ConstListBool { .. }
2280 | Op::ConstListString { .. } => true,
2281 Op::Block { body, .. } | Op::Loop { body, .. } => body_references_const_pool(body),
2282 Op::If {
2283 then_body,
2284 else_body,
2285 ..
2286 } => body_references_const_pool(then_body) || body_references_const_pool(else_body),
2287 // `Op::Call` inlines a bundled-stdlib body whose own const-
2288 // pool ops would resolve against the same (empty, on the fast
2289 // entry) pool. Mirror `ConstPool::collect_op`'s stdlib
2290 // recursion so a stdlib body that bakes a literal also forces
2291 // the buffer entry.
2292 Op::Call { fn_index, .. } => {
2293 let stdlib = relon_ir::stdlib::builtin_stdlib();
2294 stdlib
2295 .get(*fn_index as usize)
2296 .map(|callee| body_references_const_pool(&callee.body_owned()))
2297 .unwrap_or(false)
2298 }
2299 _ => false,
2300 };
2301 if hit {
2302 return true;
2303 }
2304 }
2305 false
2306}
2307
2308/// P3 §2.2 wasm closed-world routing: derive a per-`import_idx`
2309/// effectful flag from the IR's `Op::CheckCap` → `Op::CallNative` shape.
2310///
2311/// The IR lowering (`try_lower_native_call`) emits one `Op::CheckCap`
2312/// per capability bit a host fn's gate requires *immediately before* the
2313/// call's argument evaluation, then the `Op::CallNative`. A **pure**
2314/// host fn (empty gate) emits zero preceding CheckCaps; an **effectful**
2315/// one (reads clock / IO / side effect — gated by a capability) emits at
2316/// least one. The `NativeImport.cap_bit` carried into codegen is always
2317/// `NO_CAPABILITY_BIT` (the guard rides the CheckCap ops, not the call),
2318/// so this CheckCap-presence scan is the in-codegen signal that survives
2319/// IR lowering — no analyzer/IR change required.
2320///
2321/// Returns `effectful[i] == true` iff import index `i`'s call site is
2322/// guarded by a preceding CheckCap. Walks every function body
2323/// (entry + helpers + lambdas), maintaining a per-body count of pending
2324/// CheckCaps consumed by the next CallNative. A pure call nested inside
2325/// an effectful call's arguments carries no CheckCap of its own, so it
2326/// won't be mis-flagged.
2327fn compute_effectful_imports(ir: &relon_ir::ir::Module) -> Vec<bool> {
2328 let mut effectful = vec![false; ir.imports.len()];
2329 for func in &ir.funcs {
2330 scan_body_effectful(&func.body, &mut effectful);
2331 }
2332 effectful
2333}
2334
2335fn scan_body_effectful(body: &[relon_ir::ir::TaggedOp], effectful: &mut [bool]) {
2336 use relon_ir::ir::Op;
2337 // Pending CheckCaps in declaration order ahead of the next CallNative
2338 // in this op sequence. The lowering pins them right before the call's
2339 // args, so a non-zero count when a CallNative is reached marks that
2340 // import effectful.
2341 let mut pending_check_caps: u32 = 0;
2342 for tagged in body {
2343 match &tagged.op {
2344 Op::CheckCap { .. } => pending_check_caps += 1,
2345 Op::CallNative { import_idx, .. } => {
2346 if pending_check_caps > 0 {
2347 if let Some(slot) = effectful.get_mut(*import_idx as usize) {
2348 *slot = true;
2349 }
2350 }
2351 pending_check_caps = 0;
2352 }
2353 // Nested control flow: recurse so a CheckCap-guarded call
2354 // inside a branch / loop is still flagged. A nested block
2355 // starts its own pending count.
2356 Op::Block { body, .. } | Op::Loop { body, .. } => {
2357 scan_body_effectful(body, effectful);
2358 }
2359 Op::If {
2360 then_body,
2361 else_body,
2362 ..
2363 } => {
2364 scan_body_effectful(then_body, effectful);
2365 scan_body_effectful(else_body, effectful);
2366 }
2367 _ => {}
2368 }
2369 }
2370}
2371
2372/// order to buffer offsets when eligible.
2373fn build_fast_path_profile(schema: &BufferSchema) -> Result<FastPathProfile, ()> {
2374 use relon_eval_api::schema_canonical::TypeRepr;
2375 // Every declared #main arg must be `Int`. Pointer-indirect /
2376 // floating-point / bool / unit are out — those would require
2377 // f64 / i32 fast-entry slots we don't enumerate.
2378 for f in &schema.main_schema.fields {
2379 if !matches!(f.ty, TypeRepr::Int) {
2380 return Err(());
2381 }
2382 }
2383 // Single-Int-field record return only. Any other shape
2384 // (multi-field record, branded sub-schema with non-Int leaves,
2385 // tail-cursor String/List) escapes the typed-i64 envelope.
2386 if !is_single_int_field_record(&schema.return_schema) {
2387 return Err(());
2388 }
2389 // Collect each arg's buffer offset from the layout — declaration
2390 // order is what the JIT entry is parameterised by.
2391 let mut arg_offsets: Vec<u32> = Vec::with_capacity(schema.main_layout.fields.len());
2392 for (i, f) in schema.main_schema.fields.iter().enumerate() {
2393 // Layout's `fields` mirrors `main_schema.fields` order; cross-
2394 // check the names so a future schema reorder surfaces.
2395 let lo = schema.main_layout.fields.get(i).ok_or(())?;
2396 if lo.name != f.name {
2397 return Err(());
2398 }
2399 arg_offsets.push(lo.offset as u32);
2400 }
2401 // Arity cap — matches `emit_fast_entry`'s `arity > 8` guard.
2402 if arg_offsets.len() > 8 {
2403 return Err(());
2404 }
2405 let ret_offset = schema
2406 .return_layout
2407 .fields
2408 .first()
2409 .map(|f| f.offset as u32)
2410 .ok_or(())?;
2411 Ok(FastPathProfile {
2412 arg_offsets,
2413 ret_offset,
2414 })
2415}
2416
// NOTE(review): the paragraph below describes an O3-pipeline helper, not
// the enum that follows — it appears to have been left behind when that
// function moved (presumably `run_default_o3_pipeline`; confirm). It is
// kept here as a plain comment so it no longer leaks into
// `EmittedEntryShape`'s rustdoc; relocate it to the function it documents.
//
// Run LLVM's `-O3` middle-end pipeline on `module`. The host-side
// JIT engine handles backend codegen-time optimisation; this
// function fills in the IR-level passes (mem2reg, instcombine, gvn,
// licm, loop-unroll, SLP-vectorize, …) that MCJIT does not invoke
// on its own.
//
// The implementation lazily initialises LLVM's native target the
// first time it is called — required by `Target::from_triple` /
// `create_target_machine`. Subsequent calls re-use the initialised
// target state.

/// Which ABI shape the emitted entry symbol exposes. Drives the
/// build.rs binding-generator's choice between a typed `(i64...) -> i64`
/// extern declaration (fast path) and a buffer-protocol call through
/// `relon-rs-shims::call_buffer_entry`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EmittedEntryShape {
    /// `extern "C" fn(i64, ...) -> i64`. Source qualified for the
    /// dispatch-boundary fast path (Int-only `#main(Int...) -> Int`,
    /// arity <= 8, no string/list/closure). The binding wraps the
    /// extern with a thin Rust shim.
    FastInt,
    /// Full buffer-protocol entry:
    /// `extern "C" fn(*const ArenaState, i32, i32, i32, i32, i64) -> i32`.
    /// Source has string/list arguments or returns, calls into
    /// stdlib helpers, or uses helper functions. The binding marshals
    /// typed Rust args into / out of an arena buffer through
    /// `relon-rs-shims::call_buffer_entry`.
    Buffer,
}
2446
/// One declared `#main` parameter (or `value` field on the return
/// schema), in declaration order. Tells the build.rs binding generator
/// what Rust type to expose for each slot and at what byte offset the
/// buffer-protocol arena writer / reader should access it.
///
/// Instances are handed to the caller through
/// [`EmitObjectInfo::main_fields`] and [`EmitObjectInfo::return_fields`].
#[derive(Debug, Clone)]
pub struct EmittedField {
    /// Field name as declared in source.
    pub name: String,
    /// Pre-computed byte offset of the slot inside its enclosing
    /// fixed area (main_params record for args, return record for
    /// the return slot).
    pub offset: u32,
    /// Erased canonical type tag. Build.rs maps each to the matching
    /// Rust type for the binding signature.
    pub ty: EmittedFieldType,
}
2463
/// Erased canonical type tag the build.rs binding generator uses to
/// pick the Rust type for each `#main` parameter / return slot.
///
/// The variants below are the only tags the binding ever sees. Shapes
/// with no variant here (e.g. nested schemas, closure-valued returns) are
/// meant to surface as `UnsupportedSignature` at emit-object time so the
/// binding never sees a type tag it can't handle.
///
/// NOTE(review): earlier wording of this comment said Phase 2 covers only
/// `Int` / `Bool` / `String` / internal unit slots and listed Float and
/// Lists as unsupported, but `Float` and `ListInt` variants exist below —
/// confirm against [`lower_field_descriptors`] which leaves are still
/// rejected.
///
/// ## Three-crate triple contract
///
/// This tag is the byte-for-byte-identical seam shared by three crates;
/// the enum is mirrored (not shared) so the runtime shim and build
/// generator don't take a dep on this codegen crate:
///
/// 1. `relon_codegen_llvm` (this enum) — produced by
///    [`lower_field_descriptors`].
/// 2. `relon_rs_shims::EmittedFieldType` — the runtime mirror;
///    `call_buffer_entry` packs/unpacks per variant.
/// 3. `relon_rs_build` — `rust_type_for` maps each variant to the Rust
///    surface type + `ArgValue` / `RetValue` constructor.
///
/// **Adding a variant is a four-touch change**: (1) add the variant
/// here + its arm in [`lower_field_descriptors`]; (2) add the mirror
/// variant + the `*_in` / `*_out` sibling helpers in
/// `relon_rs_shims::marshal`; (3) add the `rust_type_for` table row in
/// `relon_rs_build`; (4) extend the cross-crate round-trip guard test.
/// The guard test in `relon-rs-build/tests/marshal_roundtrip.rs` fails
/// closed if any of the three crates drift.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EmittedFieldType {
    /// `i64`. Inline slot at offset, 8/8.
    Int,
    /// `f64`. Inline slot at offset, 8/8 (8 LE bytes, IEEE-754).
    Float,
    /// `bool`. Inline slot at offset, 1/1.
    Bool,
    /// `()`. Inline slot at offset, 1/1 (always reads as zero).
    Unit,
    /// `&str` / `String`. Pointer-indirect: fixed slot is a 4-byte
    /// buffer-relative offset to a `[len: u32 LE][utf8 bytes]` tail
    /// record. Build.rs uses `BufferBuilder::write_string` to pack
    /// inputs and `BufferReader::read_string` to decode outputs.
    String,
    /// `&[i64]` / `Vec<i64>`. Pointer-indirect (like `String`): the
    /// fixed slot is a 4-byte buffer-relative offset to a
    /// `[len: u32 LE][pad to 8][i64 LE …]` tail record (8/8-inline
    /// elements, byte-identical to the ConstPool `add_list_int` blob).
    /// Build.rs uses `BufferBuilder::write_list_int` to pack inputs and
    /// `BufferReader::read_list_int` to decode outputs.
    ListInt,
}
2515
/// Metadata returned by [`LlvmAotEvaluator::emit_object`] so the
/// build.rs caller can stamp matching `extern "C"` declarations and
/// marshalling code into the generated Rust shim.
///
/// The shape carried by [`Self::shape`] decides the binding shape:
/// fast-path entries get a thin `extern "C" fn(i64, ...) -> i64`
/// wrapper; buffer-protocol entries route through
/// `relon-rs-shims::call_buffer_entry` with typed Rust args.
///
/// Under [`EmittedEntryShape::FastInt`] the emitter clears every
/// buffer-only field (field vectors, root sizes, const-pool blob,
/// tail flag, shim flag), so consumers can branch on `shape` alone.
#[derive(Debug, Clone)]
pub struct EmitObjectInfo {
    /// Exported C ABI symbol name (chosen by the caller; the emitter
    /// renames the JIT-side default to this).
    pub entry_symbol: String,
    /// Number of declared `#main` parameters. For fast-path entries
    /// this equals the C ABI arity; for buffer-protocol entries the C
    /// ABI arity is always 6, while this field reports the
    /// user-visible `#main` arity.
    ///
    /// NOTE(review): the emitter derives this from the lowered
    /// parameter-descriptor count; on a target where descriptor
    /// lowering is allowed to fail soft (empty descriptor vec) this
    /// would read 0 rather than the declared arity — confirm that is
    /// intended.
    pub entry_arity: usize,
    /// Declared parameter names in `#main(...)` declaration order.
    /// Build.rs uses these to name the Rust shim's args.
    pub param_names: Vec<String>,
    /// Which extern signature the emitted symbol carries. Drives the
    /// binding generator's dispatch shape.
    pub shape: EmittedEntryShape,
    /// Declared `#main` parameters with byte-offsets and type tags.
    /// Used by the buffer-protocol binding to pack input args into
    /// the arena. Empty under [`EmittedEntryShape::FastInt`] (the
    /// fast path reads args from positional registers, not the
    /// buffer).
    pub main_fields: Vec<EmittedField>,
    /// Return record fields. Phase 2 lowering always wraps the
    /// `#main` return in a single-field schema `Ret { value: T }`,
    /// so this vector has exactly one entry. Empty under
    /// [`EmittedEntryShape::FastInt`].
    pub return_fields: Vec<EmittedField>,
    /// Fixed-area byte size of the input record. The buffer-protocol
    /// binding allocates `in_len = main_root_size + tail_len_for_strings`
    /// bytes. Zero under [`EmittedEntryShape::FastInt`].
    pub main_root_size: u32,
    /// Fixed-area byte size of the return record. The buffer-protocol
    /// binding reserves at least this much in the output region.
    /// Zero under [`EmittedEntryShape::FastInt`].
    pub return_root_size: u32,
    /// Whether the return schema contains pointer-indirect leaves
    /// (`String` / `List*`) — drives the binding's tail-cap sizing.
    /// Always `false` under [`EmittedEntryShape::FastInt`].
    pub return_has_tail: bool,
    /// Const-pool blob the JIT body references through arena-relative
    /// i32 offsets (`Op::ConstString` records). The binding copies
    /// this verbatim to `arena[..const_data.len()]` before every
    /// dispatch. Empty under [`EmittedEntryShape::FastInt`] (the fast
    /// path doesn't touch the const pool).
    pub const_data: Vec<u8>,
    /// `true` when the emitted body references a host shim that lives
    /// in the `relon-rs-shims` staticlib (`relon_llvm_str_contains_arena`
    /// or Wave B's `relon_llvm_f64_to_str`). Build.rs uses this to
    /// decide whether to add that staticlib to the linker invocation.
    /// The historical name predates the second shim; semantically it
    /// means "needs the rs-shims staticlib". Always `false` under
    /// [`EmittedEntryShape::FastInt`].
    pub references_str_contains_shim: bool,
}
2576
2577impl LlvmAotEvaluator {
2578 /// AOT entry: compile `src` into a relocatable ELF object file
2579 /// suitable for linker consumption (build.rs path).
2580 ///
2581 /// Phase 2 envelope:
2582 ///
2583 /// - When the source qualifies for the dispatch-boundary fast
2584 /// path (Int-only `#main(Int...) -> Int`, arity <= 8, no
2585 /// pointer-indirect leaves, no stdlib call overhead), the
2586 /// emitted symbol carries the typed
2587 /// `extern "C" fn(i64, ...) -> i64` shape — the Phase 1 trivial
2588 /// path. No `SandboxState`, no const-pool, no shim
2589 /// dependency.
2590 /// - Otherwise the symbol carries the full buffer-protocol entry
2591 /// shape `extern "C" fn(*const ArenaState, i32, i32, i32, i32,
2592 /// i64) -> i32`. The build.rs binding generator routes typed
2593 /// Rust args through `relon-rs-shims::call_buffer_entry` to
2594 /// marshal them into / out of the arena.
2595 ///
2596 /// In both modes the emitter returns an [`EmitObjectInfo`] that
2597 /// carries the metadata the binding generator needs (entry shape,
2598 /// schema field offsets, const-pool blob, shim reference flag).
2599 ///
2600 /// Returns [`LlvmError::UnsupportedSignature`] when the declared
2601 /// `#main` signature mixes types Phase 2 hasn't wired marshalling
2602 /// for yet (`Float`, `List*`, nested schemas as args, closure
2603 /// returns) — Phase 3 widens the surface.
2604 pub fn emit_object(
2605 src: &str,
2606 entry_symbol: &str,
2607 out_path: &Path,
2608 ) -> Result<EmitObjectInfo, LlvmError> {
2609 // Thin wrapper preserving the historical 3-arg signature the
2610 // rs-build `emit_all` calls (Stage 2 keeps this call site
2611 // stable). Default options (no host `#native` declarations) +
2612 // open-world dispatch — byte-identical to the pre-S2.⑤ path.
2613 let options = relon_analyzer::AnalyzeOptions {
2614 strict_mode: false,
2615 ..Default::default()
2616 };
2617 Self::emit_object_with_options(
2618 src,
2619 entry_symbol,
2620 out_path,
2621 &options,
2622 WorldMode::OpenWorld,
2623 None,
2624 )
2625 }
2626
2627 /// Stage 2.⑤ options-carrying object-emit seam.
2628 ///
2629 /// Threads a caller-supplied [`relon_analyzer::AnalyzeOptions`] (so
2630 /// host `#native` declarations resolve — the W1-C capability-gate
2631 /// e2e enabler) and a [`WorldMode`] through the object-emit path.
2632 ///
2633 /// - [`WorldMode::OpenWorld`] (the [`Self::emit_object`] default):
2634 /// `Op::CallNative` lowers to the dynamic `relon_llvm_call_native`
2635 /// helper. `host_shim_src` is ignored.
2636 /// - [`WorldMode::ClosedWorld`]: `Op::CallNative` lowers to a direct
2637 /// `call @<host_symbol>`; `host_shim_src` (the `#[no_mangle]
2638 /// extern "C"` host crate) is compiled to LLVM-18 bitcode, linked
2639 /// into the emitted module, force-inlined, and folded by O3 — so
2640 /// every native call collapses to the host fn body in the `.o`.
2641 /// A `None` shim on the closed-world path is an error when the
2642 /// source actually imports a host fn.
2643 pub fn emit_object_with_options(
2644 src: &str,
2645 entry_symbol: &str,
2646 out_path: &Path,
2647 options: &relon_analyzer::AnalyzeOptions,
2648 world_mode: WorldMode,
2649 host_shim_src: Option<&str>,
2650 ) -> Result<EmitObjectInfo, LlvmError> {
2651 // Default target is the host (native x86-64 ELF). S3.X adds the
2652 // wasm32 retarget via `emit_object_for_target`.
2653 Self::emit_object_for_target(
2654 src,
2655 entry_symbol,
2656 out_path,
2657 options,
2658 world_mode,
2659 host_shim_src,
2660 CodegenTarget::Native,
2661 )
2662 }
2663
2664 /// S3.X object-emit seam parameterised by [`CodegenTarget`].
2665 ///
2666 /// `CodegenTarget::Native` is byte-identical to the historical
2667 /// [`Self::emit_object_with_options`] path. `CodegenTarget::Wasm32`
2668 /// runs the SAME relon-IR → LLVM-IR emitter but constructs a
2669 /// `wasm32-wasi` `TargetMachine` (+ stamps the module's wasm32
2670 /// triple / DataLayout) so `write_to_file` emits a `\0asm` object
2671 /// instead of an ELF `.o`. The lowered body is unchanged — `mem.rs`
2672 /// already lays the arena out via pointer-width-agnostic i32-offset
2673 /// GEPs.
2674 ///
2675 /// Wasm32 supports both worlds (P3 §2.2). Open-world routes every
2676 /// `#native` host fn through a WASI import. Closed-world co-compiles
2677 /// the **pure-compute** host fns into the wasm unit and inlines them
2678 /// (via `link_and_inline_host_shim_wasm_pure_only`), while still
2679 /// routing **effectful** (capability-gated) host fns through WASI
2680 /// imports — symmetric with the native closed-world inline.
2681 #[allow(clippy::too_many_arguments)]
2682 pub fn emit_object_for_target(
2683 src: &str,
2684 entry_symbol: &str,
2685 out_path: &Path,
2686 options: &relon_analyzer::AnalyzeOptions,
2687 world_mode: WorldMode,
2688 host_shim_src: Option<&str>,
2689 target: CodegenTarget,
2690 ) -> Result<EmitObjectInfo, LlvmError> {
2691 let (ir, main_schema, return_schema) = Self::lower_source_with_options(src, Some(options))?;
2692 let main_layout = relon_eval_api::layout::SchemaLayout::offsets_for(&main_schema)
2693 .map_err(|e| LlvmError::Codegen(format!("main schema layout: {e}")))?;
2694 let return_layout = relon_eval_api::layout::SchemaLayout::offsets_for(&return_schema)
2695 .map_err(|e| LlvmError::Codegen(format!("return schema layout: {e}")))?;
2696 let param_names: Vec<String> = main_schema.fields.iter().map(|f| f.name.clone()).collect();
2697 let schema = BufferSchema {
2698 main_schema,
2699 return_schema,
2700 main_layout,
2701 return_layout,
2702 };
2703
2704 // Materialise the per-field metadata up-front so we can hand
2705 // it back regardless of whether we end up on the fast or
2706 // buffer-protocol path. Surfaces an `UnsupportedSignature`
2707 // for type tags Phase 2 hasn't wired marshalling for yet —
2708 // the build.rs binding side can't generate a Rust wrapper
2709 // for an unknown leaf type.
2710 //
2711 // This strict projection only matters to the **build.rs binding
2712 // generator**, which consumes `main_fields` / `return_fields` to
2713 // stamp the typed Rust wrapper — that path is `Native` only. The
2714 // `Wasm32` target feeds the **wasm-evaluator host**, which packs
2715 // its input and decodes its return through `wasm_buffer_plan` /
2716 // `wasm_buffer_decode` (driven by the full `BufferSchema`), never
2717 // these erased descriptors. So a `#main` carrying a pointer-array
2718 // list param/return the binding can't marshal (e.g. an in-place
2719 // `List<List<scalar>>` / `List<String>` / `List<Schema>` identity)
2720 // must still emit a runnable wasm body. We therefore only enforce
2721 // the binding-marshallability gate on `Native`; on `Wasm32` an
2722 // unbindable leaf yields an empty descriptor vec (the wasm host
2723 // ignores it) rather than aborting the emit.
2724 let descriptors_strict = matches!(target, CodegenTarget::Native);
2725 let (main_fields, return_fields) = if descriptors_strict {
2726 (
2727 lower_field_descriptors(&schema.main_schema, &schema.main_layout)?,
2728 lower_field_descriptors(&schema.return_schema, &schema.return_layout)?,
2729 )
2730 } else {
2731 (
2732 lower_field_descriptors(&schema.main_schema, &schema.main_layout)
2733 .unwrap_or_default(),
2734 lower_field_descriptors(&schema.return_schema, &schema.return_layout)
2735 .unwrap_or_default(),
2736 )
2737 };
2738
2739 let entry_idx = ir
2740 .entry_func_index
2741 .ok_or_else(|| LlvmError::Codegen("IR module has no entry function".into()))?;
2742 let entry = &ir.funcs[entry_idx];
2743
2744 // Verify the IR carries the canonical buffer-protocol entry
2745 // signature. `lower_workspace_single` always produces this
2746 // shape today; failing the check means an IR-layer change
2747 // slipped past the test gates.
2748 if !crate::codegen::is_buffer_protocol_signature(&entry.params, entry.ret) {
2749 return Err(LlvmError::UnsupportedSignature(
2750 "relon-rs build: lowering produced a non-buffer entry shape".into(),
2751 ));
2752 }
2753
2754 // Fast-path eligibility — Int-only schema, arity <= 8, no
2755 // pointer-indirect leaves. Sources that don't qualify drop to
2756 // the buffer-protocol path below.
2757 //
2758 // Stage 2.⑤: the closed-world path always takes the buffer
2759 // entry — `Op::CallNative` needs the `*state` pointer only the
2760 // buffer entry threads (the fast entry has no state slot). An
2761 // Int-only `#main` that calls a host fn would otherwise match
2762 // the fast profile and emit an entry the native-dispatch
2763 // lowering rejects. Force buffer mode for closed-world.
2764 let fast_profile = match world_mode {
2765 WorldMode::ClosedWorld => None,
2766 // P3 §2.2: a module that calls a `#native` host fn must take
2767 // the buffer entry even when its `#main` schema is Int-only
2768 // and would otherwise match the fast profile — `Op::CallNative`
2769 // / the preceding `Op::CheckCap` need the `*state` pointer and
2770 // the trailing `caps` slot only the buffer entry threads (the
2771 // fast `(i64..)->i64` entry has neither). Same reasoning the
2772 // closed-world arm uses to force buffer mode.
2773 WorldMode::OpenWorld if !ir.imports.is_empty() => None,
2774 WorldMode::OpenWorld => build_fast_path_profile(&schema).ok(),
2775 };
2776
2777 let ctx = Context::create();
2778 let module = ctx.create_module("relon_rs_object");
2779
2780 // Phase E.1 const-pool blob; needed by buffer-protocol bodies
2781 // for `Op::ConstString { idx }` resolution. The fast path
2782 // doesn't reference the pool (Int-only bodies have no
2783 // ConstString ops) so the blob ends up empty in that branch.
2784 let const_pool = ConstPool::from_module(&ir)?;
2785
2786 // Phase D fast-entry eligibility is decided from the `#main`
2787 // schema alone (Int args, single-Int return). That envelope is
2788 // necessary but not sufficient: a fast-qualifying schema can
2789 // still wrap a body that touches ops the `(i64..) -> i64` fast
2790 // entry can't lower — most notably `Op::ConstString` /
2791 // `Op::ConstList*`, which resolve against the arena-prefix
2792 // const-pool the fast entry has no state pointer to reach (it
2793 // emits with an empty pool). W4
2794 // (`range(n).map(=>"axb").filter(s.contains("x")).len()`) is the
2795 // canonical case: an `Int -> Int` schema over a string-literal
2796 // body. The in-process MCJIT path (`from_ir_inner_world`) emits
2797 // the buffer entry first and treats a failed fast-entry emit as
2798 // a soft "no fast path", rolling the fast entry back and keeping
2799 // the buffer entry. The object-emit path historically emitted
2800 // *only* the fast entry, so the same body hard-failed here with
2801 // a `missing const-pool entry`. Mirror MCJIT: try the fast entry
2802 // first, and on emit failure fall through to the buffer entry
2803 // (which lowers `Op::ConstString` against the real const-pool).
2804 let fast_profile = match fast_profile {
2805 // W7 recursive-closure Dict: a module that declares lambdas
2806 // (`#internal fib: (k) => ... fib(...)`) can match the fast
2807 // `(i64..) -> i64` envelope (Int `#main`, single-Int `result`
2808 // field) yet its body emits `Op::MakeClosure` /
2809 // `Op::CallClosure`, which resolve a lambda FunctionValue from
2810 // the module-wide `closure_fn_table`. The fast-only object-emit
2811 // branch emits *only* the fast entry with empty helper / closure
2812 // tables (it never declares + emits the lambda bodies), so
2813 // `MakeClosure fn_table_idx=N` hits an empty table. The buffer
2814 // path routes through `emit_module_funcs`, which declares every
2815 // lambda up-front (forward reference for `fib`'s self-call) and
2816 // emits each lambda body — the only place closures lower
2817 // correctly for static object emit. Force the buffer entry
2818 // whenever the module declares any lambda. The in-process MCJIT
2819 // path (`from_ir_inner_world`) already gets this for free: it
2820 // emits the buffer module first (lambdas declared + emitted) and
2821 // only *adds* a fast entry on top, reusing the populated table.
2822 Some(profile) if fast_entry_emittable(entry) && ir.closure_table.is_empty() => {
2823 Some(profile)
2824 }
2825 _ => None,
2826 };
2827
2828 let (shape, references_str_contains_shim) = match fast_profile {
2829 Some(ref profile) => {
2830 // Fast-path entry only. Same shape the Phase 1 trivial
2831 // demo path emitted — pure i64 in / i64 out, no
2832 // SandboxState pointer, no const-pool copy.
2833 //
2834 // Phase D.2: the W7 anon-Dict-return shape needs the
2835 // module-wide helper / closure tables so the fast entry
2836 // can resolve in-body `Op::Call` / `Op::CallClosure`
2837 // sites. Empty tables are fine for Phase D.1's pure
2838 // Int-arithmetic bodies (W1) — the emitter just never
2839 // looks them up.
2840 let helper_table: HashMap<u32, FunctionValue<'_>> = HashMap::new();
2841 let closure_fn_table: Vec<FunctionValue<'_>> = Vec::new();
2842 let llvm_fn = emit_fast_entry(
2843 &ctx,
2844 &module,
2845 entry,
2846 profile,
2847 &helper_table,
2848 &closure_fn_table,
2849 )?;
2850 llvm_fn.as_global_value().set_name(entry_symbol);
2851 llvm_fn.set_linkage(Linkage::External);
2852 (EmittedEntryShape::FastInt, false)
2853 }
2854 None => {
2855 // Buffer-protocol entry. Routes through
2856 // `emit_module_funcs` so user-defined helper functions
2857 // and bundled-stdlib bodies (Phase 2 P1 surface) lower
2858 // alongside the entry.
2859 let buffer_return_size = schema.return_layout.root_size as u32;
2860 let lambda_ir_idx_set: std::collections::HashSet<u32> =
2861 ir.closure_table.iter().copied().collect();
2862 let helpers: Vec<&relon_ir::ir::Func> = ir
2863 .funcs
2864 .iter()
2865 .enumerate()
2866 .filter(|(i, _)| *i != entry_idx && !lambda_ir_idx_set.contains(&(*i as u32)))
2867 .map(|(_, f)| f)
2868 .collect();
2869 let helper_ir_indices: Vec<u32> = ir
2870 .funcs
2871 .iter()
2872 .enumerate()
2873 .filter(|(i, _)| *i != entry_idx && !lambda_ir_idx_set.contains(&(*i as u32)))
2874 .map(|(i, _)| i as u32)
2875 .collect();
2876 let lambdas: Vec<&relon_ir::ir::Func> = ir
2877 .closure_table
2878 .iter()
2879 .map(|&ir_idx| &ir.funcs[ir_idx as usize])
2880 .collect();
2881 // Stage 2.⑤ / P3 §2.2: pick the dispatch emitter by world
2882 // mode + target. Native open-world (default / rs-build
2883 // today) keeps the dynamic `relon_llvm_call_native` hop;
2884 // native closed-world lowers `Op::CallNative` to a direct
2885 // `call @<host>` that the host-bitcode link + inline below
2886 // folds away. wasm32 open-world lowers `Op::CallNative` to a
2887 // **wasm import** call (`crate::wasi_host`). wasm32
2888 // closed-world (P3 §2.2 co-compile) inlines the
2889 // **pure-compute** host fns into the wasm unit while routing
2890 // **effectful** ones (capability-gated) through wasm imports
2891 // — `effectful_imports` carries the per-import split derived
2892 // from the IR's CheckCap shape.
2893 let effectful_imports = compute_effectful_imports(&ir);
2894 let llvm_fn = match (world_mode, target) {
2895 (WorldMode::ClosedWorld, CodegenTarget::Wasm32) => {
2896 emit_module_funcs_closed_world_wasm(
2897 &ctx,
2898 &module,
2899 entry,
2900 buffer_return_size,
2901 &const_pool,
2902 &helpers,
2903 Some(&helper_ir_indices),
2904 &lambdas,
2905 &ir.closure_table,
2906 &ir.imports,
2907 &effectful_imports,
2908 )?
2909 .0
2910 }
2911 (world_mode, target) => {
2912 let emit = match (world_mode, target) {
2913 (WorldMode::OpenWorld, CodegenTarget::Wasm32) => emit_module_funcs_wasm,
2914 (WorldMode::OpenWorld, CodegenTarget::Native) => emit_module_funcs,
2915 (WorldMode::ClosedWorld, _) => emit_module_funcs_closed_world,
2916 };
2917 emit(
2918 &ctx,
2919 &module,
2920 entry,
2921 buffer_return_size,
2922 &const_pool,
2923 &helpers,
2924 Some(&helper_ir_indices),
2925 &lambdas,
2926 &ir.closure_table,
2927 &ir.imports,
2928 )?
2929 .0
2930 }
2931 };
2932 // Rename the canonical buffer entry to the build.rs-
2933 // supplied symbol and force external linkage so the
2934 // consuming binary's linker can resolve it.
2935 llvm_fn.as_global_value().set_name(entry_symbol);
2936 llvm_fn.set_linkage(Linkage::External);
2937
2938 // Closed-world: link the host shim bitcode into THIS
2939 // module + force-inline every imported host fn so the
2940 // direct `call @<host>` sites collapse to the host body.
2941 // Reuses the `crate::cocompile` link/inline orchestration.
2942 // Native links the host shim built for the host triple;
2943 // wasm32 links the host shim built for
2944 // `wasm32-unknown-unknown` so the inlined body matches the
2945 // wasm unit's pointer width. Either way only the
2946 // pre-declared (pure) host fns carry a direct `call @<host>`
2947 // to fold — effectful imports stay as wasm imports.
2948 if matches!(world_mode, WorldMode::ClosedWorld) {
2949 let shim = host_shim_src.ok_or_else(|| {
2950 LlvmError::Codegen(
2951 "emit_object_with_options: ClosedWorld requires a host_shim_src \
2952 (the #[no_mangle] extern \"C\" host crate to link + inline)"
2953 .into(),
2954 )
2955 })?;
2956 match target {
2957 CodegenTarget::Wasm32 => {
2958 crate::cocompile::link_and_inline_host_shim_wasm_pure_only(
2959 &module,
2960 shim,
2961 &ir.imports,
2962 &effectful_imports,
2963 )?;
2964 }
2965 CodegenTarget::Native => {
2966 crate::cocompile::link_and_inline_host_shim(
2967 &module,
2968 shim,
2969 &ir.imports,
2970 )?;
2971 }
2972 }
2973 }
2974
2975 // Detect whether the emitted module references any
2976 // host shim that lives in the `relon-rs-shims`
2977 // staticlib (`relon_llvm_str_contains_arena`, Wave B's
2978 // `relon_llvm_f64_to_str`) — drives build.rs's decision
2979 // to add that staticlib to the linker invocation. We
2980 // check by name lookup against the LLVM module since
2981 // the emit pass declares each extern lazily on its
2982 // first call site.
2983 let needs_shim = module
2984 .get_function(RELON_LLVM_STR_CONTAINS_ARENA_SYMBOL)
2985 .is_some()
2986 || module
2987 .get_function(crate::str_helpers::RELON_LLVM_F64_TO_STR_SYMBOL)
2988 .is_some();
2989 (EmittedEntryShape::Buffer, needs_shim)
2990 }
2991 };
2992
2993 module.verify().map_err(|e| {
2994 LlvmError::Codegen(format!("LLVM verifier rejected object module: {e}"))
2995 })?;
2996
2997 // Construct the object-emit `TargetMachine` for the requested
2998 // target up front so the same machine drives both the O3
2999 // pipeline and the backend codegen below.
3000 let (machine, target_triple) = create_object_target_machine(target)?;
3001
3002 // Stamp the module's triple + DataLayout so the lowered pointer
3003 // width / endianness match the machine. Native inherits the
3004 // host triple LLVM already uses; wasm32 needs the explicit
3005 // `wasm32-wasi` triple + 32-bit DataLayout or the
3006 // verifier/codegen would default to the host's 64-bit layout.
3007 // Pulling the DataLayout straight from the machine's target data
3008 // keeps it authoritative for whichever target we built.
3009 module.set_triple(&TargetTriple::create(&target_triple));
3010 module.set_data_layout(&machine.get_target_data().get_data_layout());
3011
3012 match target {
3013 CodegenTarget::Native => {
3014 // Stamp the host CPU onto every function so the
3015 // per-function subtarget matches the host `TargetMachine`.
3016 // Keeps the AOT and MCJIT paths consistent.
3017 stamp_host_target_attributes(&module);
3018 // Host-targeted O3 (same pipeline the JIT path uses).
3019 run_default_o3_pipeline(&module)?;
3020 }
3021 CodegenTarget::Wasm32 => {
3022 // No host-CPU stamping (x86 features are meaningless for
3023 // wasm and would mis-narrow lowering). Run O3 against the
3024 // wasm32 machine so the middle-end optimises for the wasm
3025 // target's DataLayout.
3026 let opts = PassBuilderOptions::create();
3027 module
3028 .run_passes("default<O3>", &machine, opts)
3029 .map_err(|e| LlvmError::Codegen(format!("wasm32 run_passes O3: {e}")))?;
3030 }
3031 }
3032
3033 if let Some(parent) = out_path.parent() {
3034 if !parent.as_os_str().is_empty() {
3035 std::fs::create_dir_all(parent)
3036 .map_err(|e| LlvmError::Codegen(format!("create out dir `{parent:?}`: {e}")))?;
3037 }
3038 }
3039 machine
3040 .write_to_file(&module, FileType::Object, out_path)
3041 .map_err(|e| LlvmError::Codegen(format!("write object `{out_path:?}`: {e}")))?;
3042
3043 // For the fast path the binding's arity matches the LLVM
3044 // entry signature's i64-slot count. For the buffer path
3045 // there's no per-Rust-arg correspondence with the LLVM
3046 // signature (which is always 6 slots), so we report the
3047 // user-visible `#main` arity instead.
3048 let entry_arity = main_fields.len();
3049 let main_root_size = schema.main_layout.root_size as u32;
3050 let return_root_size = schema.return_layout.root_size as u32;
3051 let return_has_tail = return_needs_tail_region(&schema.return_schema);
3052 let const_data = match shape {
3053 EmittedEntryShape::FastInt => Vec::new(),
3054 EmittedEntryShape::Buffer => const_pool.bytes,
3055 };
3056 let (main_fields_out, return_fields_out, main_root_size_out, return_root_size_out) =
3057 match shape {
3058 EmittedEntryShape::FastInt => (Vec::new(), Vec::new(), 0, 0),
3059 EmittedEntryShape::Buffer => {
3060 (main_fields, return_fields, main_root_size, return_root_size)
3061 }
3062 };
3063
3064 Ok(EmitObjectInfo {
3065 entry_symbol: entry_symbol.to_string(),
3066 entry_arity,
3067 param_names,
3068 shape,
3069 main_fields: main_fields_out,
3070 return_fields: return_fields_out,
3071 main_root_size: main_root_size_out,
3072 return_root_size: return_root_size_out,
3073 return_has_tail: matches!(shape, EmittedEntryShape::Buffer) && return_has_tail,
3074 const_data,
3075 references_str_contains_shim,
3076 })
3077 }
3078}
3079
3080/// Walk a `(Schema, OffsetTable)` pair and project the per-field
3081/// declaration into the build.rs-visible [`EmittedField`] shape. The
3082/// type tag is erased into [`EmittedFieldType`] for the Phase 2
3083/// supported leaf set; any unsupported leaf surfaces as
3084/// [`LlvmError::UnsupportedSignature`] so build.rs never generates a
3085/// binding it can't compile.
3086fn lower_field_descriptors(
3087 schema: &relon_eval_api::schema_canonical::Schema,
3088 layout: &relon_eval_api::layout::OffsetTable,
3089) -> Result<Vec<EmittedField>, LlvmError> {
3090 let mut out = Vec::with_capacity(schema.fields.len());
3091 for (i, f) in schema.fields.iter().enumerate() {
3092 let lo = layout.fields.get(i).ok_or_else(|| {
3093 LlvmError::Codegen(format!(
3094 "lower_field_descriptors: layout missing slot for field `{}`",
3095 f.name
3096 ))
3097 })?;
3098 if lo.name != f.name {
3099 return Err(LlvmError::Codegen(format!(
3100 "lower_field_descriptors: schema/layout name mismatch at slot {i}: schema=`{}`, layout=`{}`",
3101 f.name, lo.name
3102 )));
3103 }
3104 let ty = emitted_field_type_for(&f.ty).ok_or_else(|| {
3105 LlvmError::UnsupportedSignature(format!(
3106 "relon-rs build (Phase 2): field `{}` type {:?} not yet wired for marshalling",
3107 f.name, f.ty
3108 ))
3109 })?;
3110 out.push(EmittedField {
3111 name: f.name.clone(),
3112 offset: lo.offset as u32,
3113 ty,
3114 });
3115 }
3116 Ok(out)
3117}
3118
3119/// Project one canonical [`TypeRepr`] onto the build.rs-visible
3120/// [`EmittedFieldType`] tag, or `None` when the leaf isn't yet wired for
3121/// AOT-binding marshalling.
3122///
3123/// This is the per-variant accept-set table for the
3124/// [`EmittedFieldType`] triple's codegen end. To widen the AOT signature
3125/// surface (e.g. Float / List lanes), add the matching arm here — the
3126/// `None` fall-through keeps every still-unsupported leaf surfacing as
3127/// `UnsupportedSignature` rather than silently emitting a tag the shim
3128/// can't decode.
3129fn emitted_field_type_for(
3130 ty: &relon_eval_api::schema_canonical::TypeRepr,
3131) -> Option<EmittedFieldType> {
3132 use relon_eval_api::schema_canonical::TypeRepr;
3133 match ty {
3134 TypeRepr::Int => Some(EmittedFieldType::Int),
3135 TypeRepr::Float => Some(EmittedFieldType::Float),
3136 TypeRepr::Bool => Some(EmittedFieldType::Bool),
3137 TypeRepr::Unit => Some(EmittedFieldType::Unit),
3138 TypeRepr::String => Some(EmittedFieldType::String),
3139 TypeRepr::List { element } if matches!(element.as_ref(), TypeRepr::Int) => {
3140 Some(EmittedFieldType::ListInt)
3141 }
3142 // ----- add new AOT-marshallable leaf type above this line -----
3143 _ => None,
3144 }
3145}
3146
3147/// Stamp the runtime host CPU/feature set onto every function in the
3148/// module as `"target-cpu"` / `"target-features"` string function
3149/// attributes.
3150///
3151/// ## Why this exists (correctness, not a micro-opt)
3152///
3153/// The MCJIT execution engine is created without an MCPU/MAttr —
3154/// `MCJITCompilerOptions` exposes no CPU field, and inkwell's
3155/// `create_*_execution_engine*` builders take only an
3156/// [`OptimizationLevel`] (+ a `CodeModel` on the memory-manager
3157/// variant). With no CPU pinned, the X86 backend lowers for **generic
3158/// x86-64** and drops every host-tuning decision the per-CPU
3159/// `SubtargetFeatures` would have enabled. The one that bites hardest:
3160/// the `SlowDivide64` tuning that narrows a 64-bit `idivq` whose
3161/// operands provably fit in 32 bits into the host `shrq $32; je; divl`
3162/// fast path. Generic codegen always emits the bare microcoded
3163/// `idivq`, so every i64 `%` / `/` runs the slow divider at runtime.
3164///
3165/// The `default<O3>` middle-end pipeline already runs against a host
3166/// `TargetMachine` (see [`run_default_o3_pipeline`]) and the static
3167/// object-emit path bakes the host CPU into its `TargetMachine` too,
3168/// so both of those already lower for the host. Only the **MCJIT
3169/// backend codegen** was generic. LLVM resolves a function's subtarget
3170/// from its `"target-cpu"` / `"target-features"` string attributes
3171/// when present, so stamping the host values here makes the MCJIT
3172/// backend lower each function for the CPU it will actually run on —
3173/// identical results, correct host instruction selection.
3174///
3175/// The CPU/features are queried from the running host
3176/// ([`TargetMachine::get_host_cpu_name`] /
3177/// [`TargetMachine::get_host_cpu_features`]) — the SAME source the O3
3178/// pipeline uses — so this is correct on any machine and never pins a
3179/// hard-coded microarchitecture.
3180fn stamp_host_target_attributes(module: &inkwell::module::Module<'_>) {
3181 // `get_host_cpu_*` reads the running CPU via LLVM's host
3182 // introspection; no native-target init is required for these two
3183 // queries, but every caller has already initialised the native
3184 // target by this point (verify -> O3 -> engine).
3185 let cpu = TargetMachine::get_host_cpu_name();
3186 let features = TargetMachine::get_host_cpu_features();
3187 let cpu = cpu.to_str().unwrap_or("");
3188 let features = features.to_str().unwrap_or("");
3189 if cpu.is_empty() {
3190 // Host introspection failed; leave the module generic rather
3191 // than stamping an empty/bogus CPU. The engine still works,
3192 // just without host narrowing (the pre-fix behaviour).
3193 return;
3194 }
3195 let ctx = module.get_context();
3196 let cpu_attr = ctx.create_string_attribute("target-cpu", cpu);
3197 let features_attr = ctx.create_string_attribute("target-features", features);
3198 let mut func = module.get_first_function();
3199 while let Some(f) = func {
3200 // Only stamp functions with a body. Pure declarations (the
3201 // `relon_llvm_str_contains_arena` host shim, intrinsics) have
3202 // no IR to lower, and stamping a target-cpu on an external
3203 // declaration is harmless but pointless.
3204 if f.count_basic_blocks() > 0 {
3205 // Idempotent: replace any pre-existing stamp so a re-run
3206 // (or an emitter that already set one) lands on the host.
3207 f.remove_string_attribute(inkwell::attributes::AttributeLoc::Function, "target-cpu");
3208 f.remove_string_attribute(
3209 inkwell::attributes::AttributeLoc::Function,
3210 "target-features",
3211 );
3212 f.add_attribute(inkwell::attributes::AttributeLoc::Function, cpu_attr);
3213 f.add_attribute(inkwell::attributes::AttributeLoc::Function, features_attr);
3214 }
3215 func = f.get_next_function();
3216 }
3217}
3218
3219fn run_default_o3_pipeline(module: &inkwell::module::Module<'_>) -> Result<(), LlvmError> {
3220 Target::initialize_native(&InitializationConfig::default())
3221 .map_err(|e| LlvmError::Codegen(format!("initialize_native: {e}")))?;
3222 let triple_str = TargetMachine::get_default_triple();
3223 let target = Target::from_triple(&triple_str)
3224 .map_err(|e| LlvmError::Codegen(format!("target from_triple: {e}")))?;
3225 let cpu = TargetMachine::get_host_cpu_name();
3226 let features = TargetMachine::get_host_cpu_features();
3227 let triple = TargetTriple::create(
3228 triple_str
3229 .as_str()
3230 .to_str()
3231 .map_err(|e| LlvmError::Codegen(format!("triple utf8: {e}")))?,
3232 );
3233 let machine = target
3234 .create_target_machine(
3235 &triple,
3236 cpu.to_str().unwrap_or(""),
3237 features.to_str().unwrap_or(""),
3238 OptimizationLevel::Aggressive,
3239 RelocMode::Default,
3240 CodeModel::JITDefault,
3241 )
3242 .ok_or_else(|| LlvmError::Codegen("create_target_machine returned null".into()))?;
3243 let opts = PassBuilderOptions::create();
3244 module
3245 .run_passes("default<O3>", &machine, opts)
3246 .map_err(|e| LlvmError::Codegen(format!("run_passes O3: {e}")))?;
3247 Ok(())
3248}
3249
3250/// Build the object-emit `TargetMachine` for the requested
3251/// [`CodegenTarget`]. Native bakes the host CPU/features + PIC reloc;
3252/// Wasm32 initialises the WebAssembly backend and pins the
3253/// `wasm32-wasi` triple. The triple String returned alongside lets the
3254/// caller stamp the module's target-triple (the DataLayout is pulled
3255/// from the machine's target data) so the wasm object's pointer width /
3256/// endianness match the machine.
3257fn create_object_target_machine(
3258 target: CodegenTarget,
3259) -> Result<(TargetMachine, String), LlvmError> {
3260 match target {
3261 CodegenTarget::Native => {
3262 Target::initialize_native(&InitializationConfig::default())
3263 .map_err(|e| LlvmError::Codegen(format!("initialize_native: {e}")))?;
3264 let triple_str = TargetMachine::get_default_triple();
3265 let t = Target::from_triple(&triple_str)
3266 .map_err(|e| LlvmError::Codegen(format!("target from_triple: {e}")))?;
3267 let cpu = TargetMachine::get_host_cpu_name();
3268 let features = TargetMachine::get_host_cpu_features();
3269 let triple = TargetTriple::create(
3270 triple_str
3271 .as_str()
3272 .to_str()
3273 .map_err(|e| LlvmError::Codegen(format!("triple utf8: {e}")))?,
3274 );
3275 let machine = t
3276 .create_target_machine(
3277 &triple,
3278 cpu.to_str().unwrap_or(""),
3279 features.to_str().unwrap_or(""),
3280 OptimizationLevel::Aggressive,
3281 RelocMode::PIC,
3282 CodeModel::Default,
3283 )
3284 .ok_or_else(|| LlvmError::Codegen("create_target_machine returned null".into()))?;
3285 let triple_owned = triple_str
3286 .as_str()
3287 .to_str()
3288 .map_err(|e| LlvmError::Codegen(format!("triple utf8: {e}")))?
3289 .to_string();
3290 Ok((machine, triple_owned))
3291 }
3292 CodegenTarget::Wasm32 => {
3293 // The WebAssembly backend lives behind the `target-webassembly`
3294 // inkwell feature; `initialize_webassembly` registers it.
3295 Target::initialize_webassembly(&InitializationConfig::default());
3296 let triple = TargetTriple::create(WASM32_TRIPLE);
3297 let t = Target::from_triple(&triple)
3298 .map_err(|e| LlvmError::Codegen(format!("wasm32 target from_triple: {e}")))?;
3299 // No host-CPU narrowing for wasm; the MVP+ feature set is
3300 // controlled by the wasm runtime (wasmtime defaults). Reloc
3301 // is irrelevant for the wasm object model — `Static`/`Default`
3302 // both produce a relocatable `\0asm` object.
3303 //
3304 // `+bulk-memory`: lower `llvm.memcpy` / `llvm.memset` to the
3305 // native `memory.copy` / `memory.fill` ops instead of a libc
3306 // `env::memcpy` import. The pointer-indirect String / List
3307 // return-store path (`emit_store_field_pointer_indirect`)
3308 // emits a `memcpy`; without bulk-memory wasm-ld leaves an
3309 // unresolved `env::memcpy` import that no standard WASI host
3310 // satisfies. wasmtime enables bulk-memory by default, so the
3311 // emitted module stays ecosystem-portable.
3312 let machine = t
3313 .create_target_machine(
3314 &triple,
3315 /*cpu=*/ "",
3316 /*features=*/ "+bulk-memory",
3317 OptimizationLevel::Aggressive,
3318 RelocMode::Static,
3319 CodeModel::Default,
3320 )
3321 .ok_or_else(|| {
3322 LlvmError::Codegen("wasm32 create_target_machine returned null".into())
3323 })?;
3324 Ok((machine, WASM32_TRIPLE.to_string()))
3325 }
3326 }
3327}