veryl_simulator/ir.rs
1pub(crate) mod context;
2pub(crate) mod declaration;
3mod event;
4mod expression;
5pub(crate) mod inst_layout;
6mod module;
7pub(crate) mod opt;
8pub(crate) mod site_table;
9mod statement;
10pub(crate) mod variable;
11pub(crate) mod write_log;
12
13pub use context::{Context, Conv};
14pub use declaration::ProtoDeclaration;
15pub use event::Event;
16pub use expression::{Expression, ExpressionContext, ProtoDynamicBitSelect, ProtoExpression};
17pub use module::{Module, ProtoModule};
18pub use statement::{
19 CompiledBatchStmt, CompiledBlockStatement, CompiledStmt, ProtoAssignDynamicStatement,
20 ProtoAssignStatement, ProtoForBound, ProtoForRange, ProtoForStatement, ProtoIfStatement,
21 ProtoStatement, ProtoStatementBlock, ProtoStatements, ProtoSystemFunctionCall, RuntimeForBound,
22 RuntimeForRange, Statement, SystemFunctionCall, TbMethodKind, format_assert_message,
23 parse_hex_content, patch_stmt_log_buf,
24};
25pub use variable::{
26 ModuleVariableMeta, ModuleVariables, VarOffset, Variable, VariableElement, VariableMeta,
27 create_variable_meta, native_bytes, read_native_value, read_payload, value_size,
28 write_native_value, write_payload,
29};
30pub use veryl_analyzer::ir::{Op, Type, VarId, VarPath};
31pub use veryl_analyzer::value::Value;
32
33use crate::HashMap;
34use crate::backend::{self, BackendRegistry, CompiledWhole, DispatchOutcome};
35use crate::simulator::SimProfile;
36use crate::simulator_error::SimulatorError;
37use std::sync::Arc;
38use std::sync::OnceLock;
39use veryl_analyzer::ir as air;
40use veryl_analyzer::value::MaskCache;
41use veryl_parser::resource_table::StrId;
42use veryl_parser::token_range::TokenRange;
43
44pub struct Ir {
45 pub name: StrId,
46 pub token: TokenRange,
47 pub ports: HashMap<VarPath, VarId>,
48 pub ff_values: Box<[u8]>,
49 pub comb_values: Box<[u8]>,
50 pub use_4state: bool,
51 pub module_variables: ModuleVariables,
52 pub event_statements: HashMap<Event, Vec<Statement>>,
53 /// Unified comb statements: all port connections, child comb, and internal
54 /// comb combined into a single dependency-sorted list.
55 pub comb_statements: Vec<Statement>,
56 /// Number of eval_comb passes needed for full convergence.
57 /// Pre-computed from backward edges in the sorted comb statement list.
58 pub required_comb_passes: usize,
59 /// FF write site table: compile-time metadata for each FF write site,
60 /// built at ProtoModule conv time. Consumed by phases that need to
61 /// reason about FF writes statically (write-log buffer sizing, NBA
62 /// invariant checks, per-Inst metadata for MT-ready commit).
63 pub site_table: site_table::SiteTable,
64 /// Per-top-level-Inst FF byte range metadata. Foundation for
65 /// cache-line aligned padding and per-Inst independent commit.
66 pub inst_layout: inst_layout::InstLayout,
67 /// FF write log buffer. Sized at Ir construction time from
68 /// `site_table.len()`; FF writes (JIT + interpret) push entries
69 /// during event evaluation and `ff_commit_from_log` applies them
70 /// at cycle end.
71 ///
72 /// Heap-allocated (`Box`) so the buffer's address is stable across
73 /// moves of the surrounding `Ir`/`Simulator` — JIT code holds a raw
74 /// pointer baked into each `Statement::Compiled` at construction.
75 pub write_log_buffer: Box<write_log::WriteLogBuffer>,
76 /// Whether FF classification optimization is disabled.
77 pub disable_ff_opt: bool,
78 /// Diagnostic: number of nontrivial SCCs found in the pre-JIT comb
79 /// graph. Real combinational loops are rejected by `analyze_dependency`,
80 /// so any non-zero value here indicates duplicate ProtoStatements in
81 /// the simulator IR assembly. See `Module::nontrivial_comb_scc`.
82 pub nontrivial_comb_scc: usize,
83 /// Whole-comb dispatch handle. `Some` when a backend (today:
84 /// AOT-C) committed to a one-function compile via
85 /// `Backend::compile_whole_comb`; `settle_comb` invokes its
86 /// `try_dispatch` in place of per-chunk Cranelift. `None` keeps
87 /// the per-chunk loop.
88 pub whole_comb: Option<Arc<dyn CompiledWhole>>,
89 /// Snapshotted from `Config::aot_c_validate`: when set, `settle_comb` /
90 /// `step` dual-run the AOT-C and Cranelift paths and panic on divergence.
91 pub aot_c_validate: bool,
92 /// Per-event whole-event dispatch handles. When the current
93 /// event's `try_dispatch` succeeds, `step()` invokes it instead of
94 /// the per-stmt Cranelift dispatch. Built in `ProtoModule::conv`
95 /// when `Config::aot_c_event` is set and the emitter covered every
96 /// event stmt.
97 pub whole_events: HashMap<Event, Arc<dyn CompiledWhole>>,
98}
99
100impl Ir {
101 pub fn from_module(module: Module, config: &Config, token: TokenRange) -> Ir {
102 let mut ir = Ir {
103 name: module.name,
104 token,
105 ports: module.ports,
106 ff_values: module.ff_values,
107 comb_values: module.comb_values,
108 use_4state: config.use_4state,
109 module_variables: module.module_variables,
110 event_statements: module.event_statements,
111 comb_statements: module.comb_statements,
112 required_comb_passes: module.required_comb_passes,
113 write_log_buffer: {
114 let (narrow_cap, wide_cap) = write_log_capacity(&module.site_table);
115 Box::new(write_log::WriteLogBuffer::with_capacity(
116 narrow_cap, wide_cap,
117 ))
118 },
119 site_table: module.site_table,
120 inst_layout: module.inst_layout,
121 disable_ff_opt: config.disable_ff_opt,
122 nontrivial_comb_scc: module.nontrivial_comb_scc,
123 whole_comb: module.whole_comb,
124 aot_c_validate: config.aot_c_validate,
125 whole_events: module.whole_events,
126 };
127 // Bake the WriteLogBuffer's heap-stable address into every
128 // JIT-dispatched Compiled/CompiledBatch so emitted code can perform
129 // inline log pushes without a TLS lookup.
130 ir.install_write_log_ptr();
131 ir
132 }
133
134 /// Walk every event/comb statement tree and overwrite the placeholder
135 /// `log_buf` field in `Statement::Compiled` / `Statement::CompiledBatch`
136 /// with the actual heap address of `self.write_log_buffer`.
137 ///
138 /// Called once at the end of `from_module`. The address is stable
139 /// for `self`'s lifetime because the buffer lives on the heap
140 /// inside a `Box`.
141 fn install_write_log_ptr(&mut self) {
142 let log_buf =
143 (&*self.write_log_buffer) as *const _ as *mut write_log::WriteLogBuffer as *mut u8;
144 for stmts in self.event_statements.values_mut() {
145 for s in stmts {
146 patch_stmt_log_buf(s, log_buf);
147 }
148 }
149 for s in &mut self.comb_statements {
150 patch_stmt_log_buf(s, log_buf);
151 }
152 }
153
154 /// Evaluate comb for `required_comb_passes` passes.
155 ///
156 /// Real combinational loops are rejected by `analyze_dependency`
157 /// (error: `combinational_loop`), so once control reaches this function
158 /// the stmt-level graph is an acyclic DAG whose depth determines how
159 /// many passes are needed to settle. No iteration-to-convergence is
160 /// required, and no runtime "did anything change?" check is performed.
161 pub fn settle_comb(&self, mask_cache: &mut MaskCache, profile: &mut SimProfile) {
162 #[cfg(feature = "profile")]
163 {
164 profile.settle_comb_count += 1;
165 }
166 let _ = profile; // suppress unused warning when profile feature is off
167
168 // Dispatch: when a whole-comb backend (today: AOT-C) is ready,
169 // invoke it in place of per-chunk Cranelift dispatch. When
170 // VERYL_AOT_C_VALIDATE=1 (`self.aot_c_validate`) we additionally
171 // dual-run the whole-comb and the per-chunk path and panic on
172 // first divergence. Both paths fall through to Cranelift if
173 // the whole-comb backend declines (`whole_comb == None`) or
174 // returns `NotReady` (async compile pending).
175 if let Some(whole) = self.whole_comb.as_ref() {
176 // Cache env var lookups in a process-static OnceLock: settle_comb
177 // runs once per cycle, so a per-cycle `std::env::var`/getenv would
178 // be a hot-path cost.
179 static AOT_C_PASSES_OVERRIDE: OnceLock<Option<usize>> = OnceLock::new();
180 let validate = self.aot_c_validate;
181 let env_passes = *AOT_C_PASSES_OVERRIDE.get_or_init(|| {
182 std::env::var("VERYL_AOT_C_PASSES")
183 .ok()
184 .and_then(|s| s.parse::<usize>().ok())
185 });
186 let ff_ptr = self.ff_values.as_ptr();
187 let comb_ptr = self.comb_values.as_ptr() as *mut u8;
188 // AOT-C comb eval never writes the log (the emitted C does
189 // `(void)write_log`), so the pointer is unused. Pass the real
190 // heap-stable buffer address anyway to satisfy the FuncPtr
191 // contract (3rd arg is `*mut u8`).
192 let log_ptr = (&*self.write_log_buffer as *const _ as *const u8) as *mut u8;
193 let passes = env_passes.unwrap_or(self.required_comb_passes).max(1);
194
195 if !validate {
196 // Common case: passes == 1 (no SCC backward edges).
197 for _ in 0..passes {
198 match whole.try_dispatch(ff_ptr, comb_ptr, log_ptr) {
199 DispatchOutcome::Done => {}
200 DispatchOutcome::NotReady => {
201 // Async compile not finished yet — drop to
202 // Cranelift for this cycle.
203 self.run_chunked_settle(mask_cache, profile);
204 return;
205 }
206 }
207 }
208 return;
209 }
210
211 // Validate path: delegate to backend::validate, which
212 // snapshots inputs, runs whole-comb, restores, runs
213 // Cranelift, and diffs. Panics on divergence.
214 backend::validate::settle_comb(self, whole.as_ref(), passes, mask_cache, profile);
215 return;
216 }
217
218 self.run_chunked_settle(mask_cache, profile);
219 }
220
221 /// Cranelift-only settle path, factored out so the validate mode can
222 /// invoke it after AOT-C eval has run and the buffers have been restored.
223 pub(crate) fn run_chunked_settle(&self, mask_cache: &mut MaskCache, profile: &mut SimProfile) {
224 let _ = profile;
225
226 // `VERYL_MIN_PASSES_OVERRIDE` is still honoured as a debug knob.
227 static MIN_PASSES_OVERRIDE: OnceLock<Option<usize>> = OnceLock::new();
228 let min_override = *MIN_PASSES_OVERRIDE.get_or_init(|| {
229 std::env::var("VERYL_MIN_PASSES_OVERRIDE")
230 .ok()
231 .and_then(|s| s.parse().ok())
232 });
233 let passes = min_override.unwrap_or(self.required_comb_passes);
234 for _ in 0..passes {
235 self.eval_comb_full(mask_cache, profile);
236 #[cfg(feature = "profile")]
237 {
238 profile.comb_eval_count += 1;
239 }
240 }
241 }
242
243 /// Evaluate unified comb once.
244 /// Called by settle_comb() for each required pass.
245 pub fn eval_comb_full(&self, mask_cache: &mut MaskCache, profile: &mut SimProfile) {
246 let _ = profile;
247 #[cfg(feature = "profile")]
248 let start = std::time::Instant::now();
249
250 for x in &self.comb_statements {
251 dispatch_stmt_fast(x, mask_cache);
252 }
253
254 #[cfg(feature = "profile")]
255 {
256 profile.eval_comb_full_ns += start.elapsed().as_nanos() as u64;
257 }
258 }
259
260 /// Number of statements in comb_statements (for profiling).
261 pub fn comb_stmt_count(&self) -> (usize, usize, usize) {
262 let mut binary = 0;
263 let mut interp = 0;
264 let mut total = 0;
265 for s in &self.comb_statements {
266 total += 1;
267 if s.is_compiled() {
268 binary += 1;
269 } else {
270 interp += 1;
271 }
272 }
273 (total, binary, interp)
274 }
275
276 pub fn dump_variables(&self) -> String {
277 format!("{}", self.module_variables)
278 }
279
280 /// Returns (jit_count, total_count) of top-level statements across all events and comb.
281 pub fn jit_stats(&self) -> (usize, usize) {
282 let mut jit = 0;
283 let mut total = 0;
284 for stmts in self.event_statements.values() {
285 for s in stmts {
286 total += 1;
287 if s.is_compiled() {
288 jit += 1;
289 }
290 }
291 }
292 for s in &self.comb_statements {
293 total += 1;
294 if s.is_compiled() {
295 jit += 1;
296 }
297 }
298 (jit, total)
299 }
300
301 /// Returns detailed stats: (comb_jit, comb_interp, event_jit, event_interp)
302 pub fn detailed_stats(&self) -> (usize, usize, usize, usize) {
303 let mut comb_jit = 0;
304 let mut comb_interp = 0;
305 let mut event_jit = 0;
306 let mut event_interp = 0;
307 for s in &self.comb_statements {
308 if s.is_compiled() {
309 comb_jit += 1;
310 } else {
311 comb_interp += 1;
312 }
313 }
314 for stmts in self.event_statements.values() {
315 for s in stmts {
316 if s.is_compiled() {
317 event_jit += 1;
318 } else {
319 event_interp += 1;
320 }
321 }
322 }
323 (comb_jit, comb_interp, event_jit, event_interp)
324 }
325}
326
327/// Inline-friendly dispatch for the per-cycle hot loop. Handles the
328/// common JIT cases (Compiled / CompiledBatch) with a direct indirect call
329/// and falls back to `Statement::eval_step` for the interpreter path.
330///
331/// Inlining at the call site removes the (otherwise non-inlined)
332/// `Statement::eval_step` function-call frame plus the 10-arm match
333/// jump it performs.
334#[inline(always)]
335pub fn dispatch_stmt_fast(s: &Statement, mask_cache: &mut MaskCache) {
336 match s {
337 Statement::Compiled(c) => unsafe {
338 (c.artifact.func)(c.ff, c.comb, c.log_buf);
339 },
340 Statement::CompiledBatch(c) => unsafe {
341 let f = c.artifact.func;
342 for &(ff, comb) in &c.args {
343 f(ff, comb, c.log_buf);
344 }
345 },
346 _ => {
347 s.eval_step(mask_cache);
348 }
349 }
350}
351
352// SAFETY: Each Ir exclusively owns its ff_values/comb_values buffers.
353// Raw pointers in Statements point into these buffers — no cross-Ir aliasing.
354// `Arc<ChunkArtifact>` handles inside `Statement::Compiled` / `CompiledBlockStatement`
355// keep JIT code pages alive (via the artifact's keepalive field).
356// NOTE: Ir is intentionally NOT Sync. Sharing &Ir across threads would allow
357// concurrent mutation of ff_values/comb_values via interior raw pointers.
358unsafe impl Send for Ir {}
359
360/// Initial WriteLogBuffer capacities derived from the FF write site table.
361/// Returns `(narrow_cap, wide_cap)`. Narrow FFs (`native_bytes ≤ 8`) emit
362/// at most 2 entries per cycle (payload + 4-state mask); wide FFs emit at
363/// most 2 wide entries per cycle (one per payload/mask). Each contributes
364/// to its respective pool, with a ×2 over-provisioning headroom for
365/// initial dual-writes and multi-RMW chains.
366fn write_log_capacity(site_table: &site_table::SiteTable) -> (usize, usize) {
367 let mut narrow: usize = 0;
368 let mut wide: usize = 0;
369 let mut any_wide = false;
370 for s in &site_table.sites {
371 let nb = s.native_bytes as usize;
372 if nb <= 8 {
373 narrow += 2 * 2;
374 } else {
375 any_wide = true;
376 // Number of wide entries needed (≤56 byte payload per entry).
377 let chunks = nb.div_ceil(write_log::WRITE_LOG_WIDE_ENTRY_PAYLOAD_BYTES);
378 wide += 2 * chunks * 2;
379 }
380 }
381 // Narrow floor avoids tiny designs ending up with zero capacity; the
382 // wide pool stays empty when no wide sites exist so designs that only
383 // use narrow FFs skip the 64-byte-aligned wide allocation altogether.
384 let narrow_cap = narrow.max(4096);
385 let wide_cap = if any_wide { wide.max(64) } else { 0 };
386 (narrow_cap, wide_cap)
387}
388
389pub fn build_ir(ir: &air::Ir, top: StrId, config: &Config) -> Result<Ir, SimulatorError> {
390 for x in &ir.components {
391 if let air::Component::Module(x) = x
392 && top == x.name
393 {
394 let token = x.token;
395 let mut context = context::Context {
396 config: config.clone(),
397 backends: BackendRegistry::for_config(config),
398 ..Default::default()
399 };
400 let proto: ProtoModule = Conv::conv(&mut context, x)?;
401 let module = proto.instantiate();
402 return Ok(Ir::from_module(module, config, token));
403 }
404 }
405 Err(SimulatorError::TopModuleNotFound {
406 module_name: top.to_string(),
407 })
408}
409
410struct CacheEntry {
411 proto: ProtoModule,
412 token: TokenRange,
413}
414
415/// Cache for `ProtoModule` keyed by top module name. JIT binaries are
416/// kept alive via shared `Arc<ChunkArtifact>` handles embedded in the
417/// cached `ProtoModule`'s `CompiledBlock` statements, so the cache no
418/// longer needs a separate keepalive vector.
419#[derive(Default)]
420pub struct ProtoModuleCache {
421 entries: HashMap<StrId, CacheEntry>,
422}
423
424pub fn build_ir_cached(
425 ir: &air::Ir,
426 top: StrId,
427 config: &Config,
428 cache: &mut ProtoModuleCache,
429) -> Result<Ir, SimulatorError> {
430 // Cache hit: reuse ProtoModule, just instantiate with fresh buffers
431 if let Some(entry) = cache.entries.get(&top) {
432 let module = entry.proto.instantiate();
433 return Ok(Ir::from_module(module, config, entry.token));
434 }
435
436 // Cache miss: run Conv::conv
437 for x in &ir.components {
438 if let air::Component::Module(x) = x
439 && top == x.name
440 {
441 let token = x.token;
442 let mut context = context::Context {
443 config: config.clone(),
444 backends: BackendRegistry::for_config(config),
445 ..Default::default()
446 };
447
448 let proto: ProtoModule = Conv::conv(&mut context, x)?;
449 let module = proto.instantiate();
450
451 let result = Ir::from_module(module, config, token);
452
453 cache.entries.insert(top, CacheEntry { proto, token });
454
455 return Ok(result);
456 }
457 }
458 Err(SimulatorError::TopModuleNotFound {
459 module_name: top.to_string(),
460 })
461}
462
/// Simulator options. `Default` leaves every flag `false` and
/// `aot_c_min_stmts` at 0.
#[derive(Clone, Debug, Default)]
pub struct Config {
    /// Select 4-state simulation.
    pub use_4state: bool,
    /// Enable the JIT path. Must stay true alongside `aot_c`, because the
    /// Cranelift loop still covers statements the `cc` backend cannot emit.
    pub use_jit: bool,
    /// Turned on by `VERYL_DUMP_CRANELIFT=1` in `apply_env`.
    pub dump_cranelift: bool,
    /// Turned on by `VERYL_DUMP_ASM=1` in `apply_env`.
    pub dump_asm: bool,
    /// Force all always_ff variables to FF (disable is_ff refinement).
    pub disable_ff_opt: bool,
    /// `cc` backend: emit comb as C, compile externally, and dispatch the
    /// `.so` instead of the Cranelift loop (which still covers stmts it can't
    /// emit, so keep `use_jit` true). Default false; `--backend cc` enables it.
    pub aot_c: bool,
    /// `cc` backend event path: also emit the per-event FF-next + write-log.
    /// Requires `aot_c`.
    pub aot_c_event: bool,
    /// Compile the `.so` on a background thread and hot-swap from Cranelift
    /// once ready, hiding the cold compile latency. Requires `aot_c`; forced
    /// off under `aot_c_validate` (validation must dual-run from cycle 0).
    pub aot_c_async: bool,
    /// Dual-run `cc` and Cranelift every cycle, panicking on the first
    /// divergence (correctness check). Implies a synchronous compile.
    pub aot_c_validate: bool,
    /// Minimum module statement count (comb + event) before `cc` is attempted.
    /// The external compile is a fixed per-module cost; on tiny modules it is
    /// pure overhead and floods the host across the fast suite. Default 0 (no
    /// floor, so tests exercise the path); `--backend cc` raises it to 256.
    pub aot_c_min_stmts: usize,
}

impl Config {
    /// Apply environment-variable overrides on top of an existing config.
    ///
    /// * `VERYL_DUMP_ASM=1` / `VERYL_DUMP_CRANELIFT=1` switch the matching
    ///   dump flag on; no value switches it off.
    /// * `VERYL_AOT_C`, `VERYL_AOT_C_EVENT`, `VERYL_AOT_C_ASYNC` and
    ///   `VERYL_AOT_C_VALIDATE` accept `1` (enable) or `0` (disable); any
    ///   other value, or an unset variable, leaves the field untouched.
    /// * `VERYL_AOT_C_MIN_STMTS` replaces `aot_c_min_stmts` when it parses
    ///   as a `usize`.
    pub fn apply_env(&mut self) {
        /// True only when `key` is set to exactly `"1"`.
        fn is_one(key: &str) -> bool {
            matches!(std::env::var(key).as_deref(), Ok("1"))
        }

        /// `1` maps to `Some(true)`, `0` to `Some(false)`, anything else to `None`.
        fn on_off(key: &str) -> Option<bool> {
            match std::env::var(key).as_deref() {
                Ok("1") => Some(true),
                Ok("0") => Some(false),
                _ => None,
            }
        }

        if is_one("VERYL_DUMP_ASM") {
            self.dump_asm = true;
        }
        if is_one("VERYL_DUMP_CRANELIFT") {
            self.dump_cranelift = true;
        }

        // AOT-C ("cc" backend) env overrides. The CLI `--backend` is the
        // primary control; these let callers force a sub-feature on/off (e.g.
        // bisect a divergence, or disable async for a deterministic profile)
        // without a flag.
        let switches = [
            ("VERYL_AOT_C", &mut self.aot_c),
            ("VERYL_AOT_C_EVENT", &mut self.aot_c_event),
            ("VERYL_AOT_C_ASYNC", &mut self.aot_c_async),
            ("VERYL_AOT_C_VALIDATE", &mut self.aot_c_validate),
        ];
        for (key, flag) in switches {
            if let Some(value) = on_off(key) {
                *flag = value;
            }
        }

        let min_stmts = std::env::var("VERYL_AOT_C_MIN_STMTS")
            .ok()
            .and_then(|raw| raw.parse::<usize>().ok());
        if let Some(n) = min_stmts {
            self.aot_c_min_stmts = n;
        }
    }
}
530
531// `cc_available()` has moved to `crate::backend::aot_c`.
532
533impl Config {
534 pub fn all() -> Vec<Config> {
535 let mut ret = vec![];
536
537 // `use_jit = true` is meaningful only when the Cranelift backend
538 // is built in; wasm has no chunk backend, so dropping the `true`
539 // arm is purely an optimization (Config::default() already sets
540 // use_jit = false).
541 let jit_options: &[bool] = if cfg!(target_family = "wasm") {
542 &[false]
543 } else {
544 &[false, true]
545 };
546
547 for use_4state in [false, true] {
548 for &use_jit in jit_options {
549 for disable_ff_opt in [false, true] {
550 ret.push(Config {
551 use_4state,
552 use_jit,
553 disable_ff_opt,
554 ..Default::default()
555 });
556 }
557 }
558 }
559
560 // `cc` backend variants: 2-state only, Cranelift fallback for uncovered
561 // stmts (use_jit stays true). Sync compile — async's swap point varies
562 // with timing, but tests must dual-check cc deterministically vs the
563 // golden output. Gated on cc_available so cc-less hosts still run.
564 #[cfg(not(target_family = "wasm"))]
565 if backend::aot_c::cc_available() {
566 for disable_ff_opt in [false, true] {
567 ret.push(Config {
568 use_4state: false,
569 use_jit: true,
570 disable_ff_opt,
571 aot_c: true,
572 aot_c_event: true,
573 aot_c_async: false,
574 ..Default::default()
575 });
576 }
577 }
578
579 ret
580 }
581}
582
583// `lookup_comb_offset` has moved to `backend::validate`.