miden_processor/fast/mod.rs
use alloc::{boxed::Box, sync::Arc, vec::Vec};
use core::cmp::min;

use memory::Memory;
use miden_air::RowIndex;
use miden_core::{
    Decorator, DecoratorIterator, EMPTY_WORD, Felt, Kernel, ONE, Operation, Program, StackOutputs,
    WORD_SIZE, Word, ZERO,
    mast::{
        BasicBlockNode, CallNode, DynNode, ExternalNode, JoinNode, LoopNode, MastForest, MastNode,
        MastNodeId, OP_GROUP_SIZE, OpBatch, SplitNode,
    },
    stack::MIN_STACK_DEPTH,
    utils::range,
};
use miden_debug_types::{DefaultSourceManager, SourceManager};

use crate::{
    AdviceInputs, AdviceProvider, AsyncHost, ContextId, ErrorContext, ExecutionError, FMP_MIN,
    ProcessState, SYSCALL_FMP_MIN, add_error_ctx_to_external_error, chiplets::Ace, err_ctx,
    utils::resolve_external_node_async,
};

mod memory;

// Ops
mod circuit_eval;
mod crypto_ops;
mod field_ops;
mod fri_ops;
mod horner_ops;
mod io_ops;
mod stack_ops;
mod sys_ops;
mod u32_ops;

#[cfg(test)]
mod tests;

/// The size of the stack buffer.
///
/// Note: This value is much larger than it needs to be for the majority of programs. However, some
/// existing programs need it (e.g. `std::math::secp256k1::group::gen_mul`), so we're forced to push
/// it up. At this high a value, we're starting to see some performance degradation on benchmarks.
/// For example, the blake3 benchmark went from 285 MHz to 250 MHz (~10% degradation). Perhaps a
/// better solution would be to make this value much smaller (~1000), and then fall back to a `Vec`
/// if the stack overflows.
const STACK_BUFFER_SIZE: usize = 6650;

/// The initial position of the top of the stack in the stack buffer.
///
/// We place this value close to 0 because if a program hits the limit, it's much more likely to hit
/// the upper bound than the lower bound, since hitting the lower bound only occurs when dropping
/// 0's that were generated automatically to keep the stack depth at 16. In practice, if this
/// occurs, it is most likely a bug.
const INITIAL_STACK_TOP_IDX: usize = 50;

/// WORD_SIZE, but as a `Felt`.
const WORD_SIZE_FELT: Felt = Felt::new(4);

/// The size of a double-word, as a `Felt`.
const DOUBLE_WORD_SIZE: Felt = Felt::new(8);

/// A fast processor which doesn't generate any trace.
///
/// This processor is designed to be as fast as possible. Hence, it only keeps track of the current
/// state of the processor (i.e. the stack, current clock cycle, current memory context, and free
/// memory pointer).
///
/// # Stack Management
/// A few key points about how the stack was designed for maximum performance:
///
/// - The stack has a fixed buffer size defined by `STACK_BUFFER_SIZE`.
///   - This was observed to increase performance by at least 2x compared to using a `Vec` with
///     `push()` & `pop()`.
/// - We track the stack top and bottom using indices `stack_top_idx` and `stack_bot_idx`,
///   respectively.
/// - Since we are using a fixed-size buffer, we need to ensure that stack buffer accesses are not
///   out of bounds. Naively, we could check for this on every access. However, every operation
///   alters the stack depth by a predetermined amount, allowing us to precisely determine the
///   minimum number of operations required to reach a stack buffer boundary, whether at the top or
///   bottom.
///   - For example, if the stack top is 10 elements away from the top boundary, and the stack
///     bottom is 15 elements away from the bottom boundary, then we can safely execute 10
///     operations that modify the stack depth with no bounds check.
/// - When switching contexts (e.g., during a call or syscall), all elements past the first 16 are
///   stored in an `ExecutionContextInfo` struct, and the stack is truncated to 16 elements. The
///   saved elements are restored when returning from the call or syscall.
///
/// # Clock Cycle Management
/// - The clock cycle (`clk`) is managed in the same way as in `Process`. That is, it is incremented
///   by 1 for every row that `Process` adds to the main trace.
///   - It is important to do so because the clock cycle is used to determine the context ID for new
///     execution contexts when using `call` or `dyncall`.
/// - When executing a basic block, the clock cycle is not incremented for every individual
///   operation, for performance reasons.
///   - Rather, we use `clk + operation_index` to determine the clock cycle when needed.
///   - However, this performance improvement is slightly offset by the need to parse operation
///     batches exactly as `Process` does. We will be able to recover the performance loss by
///     redesigning the `BasicBlockNode`.
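///
/// # Example
///
/// A minimal usage sketch, not compiled as a doc test. It assumes a compiled `Program` and a host
/// implementing `AsyncHost` are available from elsewhere; `program` and `host` below are
/// placeholders, and imports are elided.
///
/// ```ignore
/// async fn run(program: &Program, host: &mut impl AsyncHost) -> Result<(), ExecutionError> {
///     // Initialize the stack with 1, 2, 3 (3 ends up on top) and execute the program.
///     let processor = FastProcessor::new(&[Felt::new(1), Felt::new(2), Felt::new(3)]);
///     let outputs = processor.execute(program, host).await?;
///     println!("{outputs:?}");
///     Ok(())
/// }
/// ```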
#[derive(Debug)]
pub struct FastProcessor {
    /// The stack is stored in reverse order, so that the last element is at the top of the stack.
    pub(super) stack: [Felt; STACK_BUFFER_SIZE],
    /// The index of the top of the stack.
    stack_top_idx: usize,
    /// The index of the bottom of the stack.
    stack_bot_idx: usize,
    /// The counter which keeps track of the number of instructions that we can execute without
    /// hitting the bounds of `stack`.
    bounds_check_counter: usize,

    /// The current clock cycle.
    ///
    /// Note that when we are in a basic block, this corresponds to the clock cycle at which the
    /// basic block was entered. Hence, given an operation, we need to add its index in the
    /// block to this value to get the clock cycle.
    pub(super) clk: RowIndex,

    /// The current context ID.
    pub(super) ctx: ContextId,

    /// The free memory pointer.
    pub(super) fmp: Felt,

    /// Whether we are currently in a syscall.
    in_syscall: bool,

    /// The hash of the function that called into the current context, or `[ZERO, ZERO, ZERO,
    /// ZERO]` if we are in the first context (i.e. when `call_stack` is empty).
    pub(super) caller_hash: Word,

    /// The advice provider to be used during execution.
    pub(super) advice: AdviceProvider,

    /// A map from (context_id, word_address) to the word stored starting at that memory location.
    pub(super) memory: Memory,

    /// A map storing metadata per call to the ACE chiplet.
    pub(super) ace: Ace,

    /// The call stack is used when starting a new execution context (from a `call`, `syscall` or
    /// `dyncall`) to keep track of the information needed to return to the previous context upon
    /// return. It is a stack since calls can be nested.
    call_stack: Vec<ExecutionContextInfo>,

    /// Whether to enable debug statements and tracing.
    in_debug_mode: bool,

    /// The source manager (providing information about the location of each instruction).
    source_manager: Arc<dyn SourceManager>,
}

impl FastProcessor {
    // CONSTRUCTORS
    // --------------------------------------------------------------------------------------------

    /// Creates a new `FastProcessor` instance with the given stack inputs.
    ///
    /// # Panics
    /// - Panics if the length of `stack_inputs` is greater than `MIN_STACK_DEPTH`.
    pub fn new(stack_inputs: &[Felt]) -> Self {
        Self::initialize(stack_inputs, AdviceInputs::default(), false)
    }

    /// Creates a new `FastProcessor` instance with the given stack and advice inputs.
    ///
    /// # Panics
    /// - Panics if the length of `stack_inputs` is greater than `MIN_STACK_DEPTH`.
    pub fn new_with_advice_inputs(stack_inputs: &[Felt], advice_inputs: AdviceInputs) -> Self {
        Self::initialize(stack_inputs, advice_inputs, false)
    }

    /// Creates a new `FastProcessor` instance, set to debug mode, with the given stack
    /// and advice inputs.
    ///
    /// # Panics
    /// - Panics if the length of `stack_inputs` is greater than `MIN_STACK_DEPTH`.
    pub fn new_debug(stack_inputs: &[Felt], advice_inputs: AdviceInputs) -> Self {
        Self::initialize(stack_inputs, advice_inputs, true)
    }

    /// Generic constructor unifying the above public ones.
    ///
    /// The stack inputs are expected to be stored in reverse order. For example, if `stack_inputs =
    /// [1,2,3]`, then the stack will be initialized as `[3,2,1,0,0,...]`, with `3` being on
    /// top.
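    ///
    /// For illustration, a hedged sketch of the resulting stack order (not compiled as a doc
    /// test); it relies only on `new()` and `stack_get()` defined in this file:
    ///
    /// ```ignore
    /// let processor = FastProcessor::new(&[Felt::new(1), Felt::new(2), Felt::new(3)]);
    /// // The last input ends up on top of the stack.
    /// assert_eq!(processor.stack_get(0), Felt::new(3));
    /// assert_eq!(processor.stack_get(1), Felt::new(2));
    /// assert_eq!(processor.stack_get(2), Felt::new(1));
    /// ```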
    fn initialize(stack_inputs: &[Felt], advice_inputs: AdviceInputs, in_debug_mode: bool) -> Self {
        assert!(stack_inputs.len() <= MIN_STACK_DEPTH);

        let stack_top_idx = INITIAL_STACK_TOP_IDX;
        let stack = {
            let mut stack = [ZERO; STACK_BUFFER_SIZE];
            let bottom_idx = stack_top_idx - stack_inputs.len();

            stack[bottom_idx..stack_top_idx].copy_from_slice(stack_inputs);
            stack
        };

        let stack_bot_idx = stack_top_idx - MIN_STACK_DEPTH;

        let bounds_check_counter = stack_bot_idx;
        let source_manager = Arc::new(DefaultSourceManager::default());
        Self {
            advice: advice_inputs.into(),
            stack,
            stack_top_idx,
            stack_bot_idx,
            bounds_check_counter,
            clk: 0_u32.into(),
            ctx: 0_u32.into(),
            fmp: Felt::new(FMP_MIN),
            in_syscall: false,
            caller_hash: EMPTY_WORD,
            memory: Memory::new(),
            call_stack: Vec::new(),
            ace: Ace::default(),
            in_debug_mode,
            source_manager,
        }
    }

    /// Set the internal source manager to an externally initialized one.
    pub fn with_source_manager(mut self, source_manager: Arc<dyn SourceManager>) -> Self {
        self.source_manager = source_manager;
        self
    }

    // ACCESSORS
    // -------------------------------------------------------------------------------------------

    /// Returns the stack, such that the top of the stack is at the last index of the returned
    /// slice.
    pub fn stack(&self) -> &[Felt] {
        &self.stack[self.stack_bot_idx..self.stack_top_idx]
    }

    /// Returns the element on the stack at index `idx`.
    #[inline(always)]
    pub fn stack_get(&self, idx: usize) -> Felt {
        self.stack[self.stack_top_idx - idx - 1]
    }

    /// Mutable variant of `stack_get()`.
    #[inline(always)]
    pub fn stack_get_mut(&mut self, idx: usize) -> &mut Felt {
        &mut self.stack[self.stack_top_idx - idx - 1]
    }

    /// Returns the word on the stack starting at index `start_idx` in "stack order".
    ///
    /// That is, for `start_idx=0` the top element of the stack will be at the last position in the
    /// word.
    ///
    /// For example, if the stack looks like this:
    ///
    /// top                                                          bottom
    ///  v                                                              v
    ///  a | b | c | d | e | f | g | h | i | j | k | l | m | n | o | p
    ///
    /// Then
    /// - `stack_get_word(0)` returns `[d, c, b, a]`,
    /// - `stack_get_word(1)` returns `[e, d, c, b]`,
    /// - etc.
    #[inline(always)]
    pub fn stack_get_word(&self, start_idx: usize) -> Word {
        debug_assert!(start_idx < MIN_STACK_DEPTH);

        let word_start_idx = self.stack_top_idx - start_idx - 4;
        let result: [Felt; WORD_SIZE] =
            self.stack[range(word_start_idx, WORD_SIZE)].try_into().unwrap();
        result.into()
    }

    /// Returns the number of elements on the stack in the current context.
    #[inline(always)]
    pub fn stack_depth(&self) -> u32 {
        (self.stack_top_idx - self.stack_bot_idx) as u32
    }

    // MUTATORS
    // -------------------------------------------------------------------------------------------

    /// Writes an element to the stack at the given index.
    #[inline(always)]
    pub fn stack_write(&mut self, idx: usize, element: Felt) {
        self.stack[self.stack_top_idx - idx - 1] = element
    }

    /// Writes a word to the stack starting at the given index.
    ///
    /// `start_idx` is the stack index of the word's first element, and the word is written in the
    /// same "stack order" as described for `stack_get_word()`: the last element of the word ends
    /// up closest to the top of the stack.
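    ///
    /// A hedged round-trip sketch (not compiled as a doc test), using only `stack_write_word()`
    /// and `stack_get_word()` from this file; `processor` is assumed to be a `FastProcessor`:
    ///
    /// ```ignore
    /// let word: Word = [Felt::new(1), Felt::new(2), Felt::new(3), Felt::new(4)].into();
    /// processor.stack_write_word(0, &word);
    /// // Reading the same position returns the word unchanged, and the word's last element
    /// // (here, 4) is on top of the stack.
    /// assert_eq!(processor.stack_get_word(0), word);
    /// assert_eq!(processor.stack_get(0), Felt::new(4));
    /// ```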
    #[inline(always)]
    pub fn stack_write_word(&mut self, start_idx: usize, word: &Word) {
        debug_assert!(start_idx < MIN_STACK_DEPTH);

        let word_start_idx = self.stack_top_idx - start_idx - 4;
        let source: [Felt; WORD_SIZE] = (*word).into();
        self.stack[range(word_start_idx, WORD_SIZE)].copy_from_slice(&source)
    }

    /// Swaps the elements at the given indices on the stack.
    #[inline(always)]
    pub fn stack_swap(&mut self, idx1: usize, idx2: usize) {
        let a = self.stack_get(idx1);
        let b = self.stack_get(idx2);
        self.stack_write(idx1, b);
        self.stack_write(idx2, a);
    }

    // EXECUTE
    // -------------------------------------------------------------------------------------------

    /// Executes the given program and returns the stack outputs.
    pub async fn execute(
        mut self,
        program: &Program,
        host: &mut impl AsyncHost,
    ) -> Result<StackOutputs, ExecutionError> {
        self.execute_impl(program, host).await
    }

    /// Executes the given program and returns the stack outputs.
    ///
    /// This function is mainly split out of `execute()` for testing purposes so that we can access
    /// the internal state of the `FastProcessor` after execution.
    async fn execute_impl(
        &mut self,
        program: &Program,
        host: &mut impl AsyncHost,
    ) -> Result<StackOutputs, ExecutionError> {
        self.execute_mast_node(program.entrypoint(), program.mast_forest(), program.kernel(), host)
            .await?;

        StackOutputs::new(
            self.stack[self.stack_bot_idx..self.stack_top_idx]
                .iter()
                .rev()
                .copied()
                .collect(),
        )
        .map_err(|_| {
            ExecutionError::OutputStackOverflow(
                self.stack_top_idx - self.stack_bot_idx - MIN_STACK_DEPTH,
            )
        })
    }

    // NODE EXECUTORS
    // --------------------------------------------------------------------------------------------

    async fn execute_mast_node(
        &mut self,
        node_id: MastNodeId,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        let node = program
            .get_node_by_id(node_id)
            .ok_or(ExecutionError::MastNodeNotFoundInForest { node_id })?;

        // Note: we only run this in case there are Trace events associated with the node. However,
        // if there are assembly ops in the "before enter" list, we will cycle through them and
        // ignore them, resulting in a performance drop. We should remove this once trace events
        // are removed from decorators, or decorators are removed entirely.
        //
        // A similar reasoning applies to the "after exit" list.
        for &decorator_id in node.before_enter() {
            self.execute_decorator(&program[decorator_id], 0, host)?;
        }

        match node {
            MastNode::Block(basic_block_node) => {
                self.execute_basic_block_node(basic_block_node, program, host).await?
            },
            MastNode::Join(join_node) => {
                Box::pin(self.execute_join_node(join_node, program, kernel, host)).await?
            },
            MastNode::Split(split_node) => {
                Box::pin(self.execute_split_node(split_node, program, kernel, host)).await?
            },
            MastNode::Loop(loop_node) => {
                Box::pin(self.execute_loop_node(loop_node, program, kernel, host)).await?
            },
            MastNode::Call(call_node) => {
                let err_ctx = err_ctx!(program, call_node, self.source_manager.clone());
                add_error_ctx_to_external_error(
                    Box::pin(self.execute_call_node(call_node, program, kernel, host)).await,
                    err_ctx,
                )?
            },
            MastNode::Dyn(dyn_node) => {
                let err_ctx = err_ctx!(program, dyn_node, self.source_manager.clone());
                add_error_ctx_to_external_error(
                    Box::pin(self.execute_dyn_node(dyn_node, program, kernel, host)).await,
                    err_ctx,
                )?
            },
            MastNode::External(external_node) => {
                Box::pin(self.execute_external_node(external_node, kernel, host)).await?
            },
        }

        for &decorator_id in node.after_exit() {
            self.execute_decorator(&program[decorator_id], 0, host)?;
        }

        Ok(())
    }

    async fn execute_join_node(
        &mut self,
        join_node: &JoinNode,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        // Corresponds to the row inserted for the JOIN operation added to the trace.
        self.clk += 1_u32;

        self.execute_mast_node(join_node.first(), program, kernel, host).await?;
        self.execute_mast_node(join_node.second(), program, kernel, host).await?;

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        Ok(())
    }

    async fn execute_split_node(
        &mut self,
        split_node: &SplitNode,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        // Corresponds to the row inserted for the SPLIT operation added to the trace.
        self.clk += 1_u32;

        let condition = self.stack_get(0);

        // drop the condition from the stack
        self.decrement_stack_size();

        // execute the appropriate branch
        let ret = if condition == ONE {
            self.execute_mast_node(split_node.on_true(), program, kernel, host).await
        } else if condition == ZERO {
            self.execute_mast_node(split_node.on_false(), program, kernel, host).await
        } else {
            let err_ctx = err_ctx!(program, split_node, self.source_manager.clone());
            Err(ExecutionError::not_binary_value_if(condition, &err_ctx))
        };

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        ret
    }

    async fn execute_loop_node(
        &mut self,
        loop_node: &LoopNode,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        // Corresponds to the row inserted for the LOOP operation added to the trace.
        self.clk += 1_u32;

        // Read the condition for entering the loop; on each iteration, the condition is re-read
        // after the loop body is executed.
        let mut condition = self.stack_get(0);

        // drop the condition from the stack
        self.decrement_stack_size();

        // execute the loop body as long as the condition is true
        while condition == ONE {
            self.execute_mast_node(loop_node.body(), program, kernel, host).await?;

            // check the loop condition, and drop it from the stack
            condition = self.stack_get(0);
            self.decrement_stack_size();

            // this clock increment is for the row inserted for the `REPEAT` node added to
            // the trace on each iteration. It needs to be at the end of this loop (instead
            // of at the beginning), otherwise we get an off-by-one error when comparing
            // with [crate::Process].
            if condition == ONE {
                self.clk += 1_u32;
            }
        }

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        if condition == ZERO {
            Ok(())
        } else {
            let err_ctx = err_ctx!(program, loop_node, self.source_manager.clone());
            Err(ExecutionError::not_binary_value_loop(condition, &err_ctx))
        }
    }

    async fn execute_call_node(
        &mut self,
        call_node: &CallNode,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        let err_ctx = err_ctx!(program, call_node, self.source_manager.clone());

        // Corresponds to the row inserted for the CALL or SYSCALL operation added to the trace.
        self.clk += 1_u32;

        // call or syscall are not allowed inside a syscall
        if self.in_syscall {
            let instruction = if call_node.is_syscall() { "syscall" } else { "call" };
            return Err(ExecutionError::CallInSyscall(instruction));
        }

        let callee_hash = program
            .get_node_by_id(call_node.callee())
            .ok_or(ExecutionError::MastNodeNotFoundInForest { node_id: call_node.callee() })?
            .digest();

        self.save_context_and_truncate_stack();

        if call_node.is_syscall() {
            // check if the callee is in the kernel
            if !kernel.contains_proc(callee_hash) {
                return Err(ExecutionError::syscall_target_not_in_kernel(callee_hash, &err_ctx));
            }

            // set the system registers to the syscall context
            self.ctx = ContextId::root();
            self.fmp = SYSCALL_FMP_MIN.into();
            self.in_syscall = true;
        } else {
            // set the system registers to the callee context
            self.ctx = self.clk.into();
            self.fmp = Felt::new(FMP_MIN);
            self.caller_hash = callee_hash;
        }

        // Execute the callee.
        self.execute_mast_node(call_node.callee(), program, kernel, host).await?;

        // when returning from a function call or a syscall, restore the context of the
        // system registers and the operand stack to what it was prior to
        // the call.
        self.restore_context(&err_ctx)?;

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        Ok(())
    }

    async fn execute_dyn_node(
        &mut self,
        dyn_node: &DynNode,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        // Corresponds to the row inserted for the DYN or DYNCALL operation added to the trace.
        self.clk += 1_u32;

        // dyn calls are not allowed inside a syscall
        if dyn_node.is_dyncall() && self.in_syscall {
            return Err(ExecutionError::CallInSyscall("dyncall"));
        }

        let err_ctx = err_ctx!(program, dyn_node, self.source_manager.clone());

        // Retrieve callee hash from memory, using stack top as the memory address.
        let callee_hash = {
            let mem_addr = self.stack_get(0);
            self.memory
                .read_word(self.ctx, mem_addr, self.clk, &err_ctx)
                .map_err(ExecutionError::MemoryError)?
        };

        // Drop the memory address from the stack. This needs to be done BEFORE saving the context,
        // because the next instruction starts with a "shifted left" stack.
        self.decrement_stack_size();

        // For dyncall, save the context and reset it.
        if dyn_node.is_dyncall() {
            self.save_context_and_truncate_stack();
            self.ctx = self.clk.into();
            self.fmp = Felt::new(FMP_MIN);
            self.caller_hash = callee_hash;
        };

        // if the callee is not in the program's MAST forest, try to find a MAST forest for it in
        // the host (corresponding to an external library loaded in the host); if none are
        // found, return an error.
        match program.find_procedure_root(callee_hash) {
            Some(callee_id) => self.execute_mast_node(callee_id, program, kernel, host).await?,
            None => {
                let mast_forest = host
                    .get_mast_forest(&callee_hash)
                    .await
                    .ok_or_else(|| ExecutionError::dynamic_node_not_found(callee_hash, &err_ctx))?;

                // We limit the parts of the program that can be called externally to procedure
                // roots, even though MAST doesn't have that restriction.
                let root_id = mast_forest
                    .find_procedure_root(callee_hash)
                    .ok_or(ExecutionError::malfored_mast_forest_in_host(callee_hash, &err_ctx))?;

                self.execute_mast_node(root_id, &mast_forest, kernel, host).await?
            },
        }

        // For dyncall, restore the context.
        if dyn_node.is_dyncall() {
            self.restore_context(&err_ctx)?;
        }

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        Ok(())
    }

    async fn execute_external_node(
        &mut self,
        external_node: &ExternalNode,
        kernel: &Kernel,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        let (root_id, mast_forest) =
            resolve_external_node_async(external_node, &mut self.advice, host).await?;

        self.execute_mast_node(root_id, &mast_forest, kernel, host).await
    }

    // Note: when executing individual ops, we do not increment the clock by 1 on every iteration,
    // for performance reasons (doing so results in a ~25% performance drop). Hence, `self.clk`
    // cannot be used directly to determine the number of operations executed in a program.
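    //
    // Instead, while inside a basic block, `self.clk` stays at the value recorded for the block,
    // and the clock cycle of the operation at index `i` in the block is derived as `self.clk + i`
    // (the operation index is carried alongside the processor in `FastProcessState` for this
    // purpose).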
    #[inline(always)]
    async fn execute_basic_block_node(
        &mut self,
        basic_block_node: &BasicBlockNode,
        program: &MastForest,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        // Corresponds to the row inserted for the SPAN operation added to the trace.
        self.clk += 1_u32;

        let mut batch_offset_in_block = 0;
        let mut op_batches = basic_block_node.op_batches().iter();
        let mut decorator_ids = basic_block_node.decorator_iter();

        // execute first op batch
        if let Some(first_op_batch) = op_batches.next() {
            self.execute_op_batch(
                basic_block_node,
                first_op_batch,
                &mut decorator_ids,
                batch_offset_in_block,
                program,
                host,
            )
            .await?;
            batch_offset_in_block += first_op_batch.ops().len();
        }

        // execute the rest of the op batches
        for op_batch in op_batches {
            // increment clock to account for `RESPAN`
            self.clk += 1_u32;

            self.execute_op_batch(
                basic_block_node,
                op_batch,
                &mut decorator_ids,
                batch_offset_in_block,
                program,
                host,
            )
            .await?;
            batch_offset_in_block += op_batch.ops().len();
        }

        // update clock with all the operations that executed
        self.clk += batch_offset_in_block as u32;

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        // Execute any decorators which have not been executed during span ops execution; this can
        // happen for decorators appearing after all operations in a block. These decorators are
        // executed after the SPAN block is closed to make sure the VM clock cycle advances beyond
        // the last clock cycle of the SPAN block ops.
        for &decorator_id in decorator_ids {
            let decorator = program
                .get_decorator_by_id(decorator_id)
                .ok_or(ExecutionError::DecoratorNotFoundInForest { decorator_id })?;
            self.execute_decorator(decorator, 0, host)?;
        }

        Ok(())
    }

    #[inline(always)]
    async fn execute_op_batch(
        &mut self,
        basic_block: &BasicBlockNode,
        batch: &OpBatch,
        decorators: &mut DecoratorIterator<'_>,
        batch_offset_in_block: usize,
        program: &MastForest,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        let op_counts = batch.op_counts();
        let mut op_idx_in_group = 0;
        let mut group_idx = 0;
        let mut next_group_idx = 1;

        // round up the number of groups to be processed to the next power of two; we do this
        // because the processor requires the number of groups to be either 1, 2, 4, or 8; if
        // the actual number of groups is smaller, we'll pad the batch with NOOPs at the end
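        //
        // For example, a batch containing 3 op groups is processed as if it had 4; the missing
        // group is accounted for by a single NOOP, i.e. one extra clock increment at the end of
        // the batch (see the final `self.clk += ...` below).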
        let num_batch_groups = batch.num_groups().next_power_of_two();

        // execute operations in the batch one by one
        for (op_idx_in_batch, op) in batch.ops().iter().enumerate() {
            while let Some(&decorator_id) =
                decorators.next_filtered(batch_offset_in_block + op_idx_in_batch)
            {
                let decorator = program
                    .get_decorator_by_id(decorator_id)
                    .ok_or(ExecutionError::DecoratorNotFoundInForest { decorator_id })?;
                self.execute_decorator(decorator, op_idx_in_batch, host)?;
            }

            // decode and execute the operation
            let op_idx_in_block = batch_offset_in_block + op_idx_in_batch;
            let err_ctx =
                err_ctx!(program, basic_block, self.source_manager.clone(), op_idx_in_block);

            // Execute the operation.
            //
            // Note: we handle the `Emit` operation separately, because it is an async operation,
            // whereas all the other operations are synchronous (resulting in a significant
            // performance improvement).
            match op {
                Operation::Emit(event_id) => {
                    self.op_emit(*event_id, op_idx_in_block, host, &err_ctx).await?
                },
                _ => {
                    // if the operation is not an Emit, we execute it normally
                    self.execute_op(op, op_idx_in_block, program, host, &err_ctx)?;
                },
            }

            // if the operation carries an immediate value, the value is stored at the next group
            // pointer; so, we advance the pointer to the following group
            let has_imm = op.imm_value().is_some();
            if has_imm {
                next_group_idx += 1;
            }

            // determine if we've executed all non-decorator operations in a group
            if op_idx_in_group == op_counts[group_idx] - 1 {
                // if we are at the end of the group, first check if the operation carries an
                // immediate value
                if has_imm {
                    // an operation with an immediate value cannot be the last operation in a
                    // group, so we need to execute a NOOP after it. In this processor, we
                    // increment the clock to account for the NOOP.
                    debug_assert!(op_idx_in_group < OP_GROUP_SIZE - 1, "invalid op index");
                    self.clk += 1_u32;
                }

                // then, move to the next group and reset operation index
                group_idx = next_group_idx;
                next_group_idx += 1;
                op_idx_in_group = 0;
            } else {
                op_idx_in_group += 1;
            }
        }

        // make sure we account for the required number of operation groups; this matters when the
        // actual number of operation groups is not a power of two. In this processor, this
        // corresponds to incrementing the clock by the number of empty op groups (i.e. 1 NOOP
        // executed per missing op group).
        self.clk += (num_batch_groups - group_idx) as u32;

        Ok(())
    }

    /// Executes the specified decorator.
    fn execute_decorator(
        &mut self,
        decorator: &Decorator,
        op_idx_in_batch: usize,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        match decorator {
            Decorator::Debug(options) => {
                if self.in_debug_mode {
                    let process = &mut self.state(op_idx_in_batch);
                    host.on_debug(process, options)?;
                }
            },
            Decorator::AsmOp(_assembly_op) => {
                // do nothing
            },
            Decorator::Trace(id) => {
                let process = &mut self.state(op_idx_in_batch);
                host.on_trace(process, *id)?;
            },
        };
        Ok(())
    }

    /// Executes the given operation.
    ///
    /// # Panics
    /// - if the operation is a control flow operation, as these are never executed,
    /// - if the operation is an `Emit` operation, as this requires async execution.
    #[inline(always)]
    fn execute_op(
        &mut self,
        operation: &Operation,
        op_idx: usize,
        program: &MastForest,
        host: &mut impl AsyncHost,
        err_ctx: &impl ErrorContext,
    ) -> Result<(), ExecutionError> {
        if self.bounds_check_counter == 0 {
            let err_str = if self.stack_top_idx - MIN_STACK_DEPTH == 0 {
                "stack underflow"
            } else {
                "stack overflow"
            };
            return Err(ExecutionError::FailedToExecuteProgram(err_str));
        }

        match operation {
            // ----- system operations ------------------------------------------------------------
            Operation::Noop => {
                // do nothing
            },
            Operation::Assert(err_code) => {
                self.op_assert(*err_code, op_idx, host, program, err_ctx)?
            },
            Operation::FmpAdd => self.op_fmpadd(),
            Operation::FmpUpdate => self.op_fmpupdate()?,
            Operation::SDepth => self.op_sdepth(),
            Operation::Caller => self.op_caller()?,
            Operation::Clk => self.op_clk(op_idx)?,
            Operation::Emit(_event_id) => {
                panic!("emit instruction requires async, so is not supported by execute_op()")
            },

            // ----- flow control operations ------------------------------------------------------
            // control flow operations are never executed directly
            Operation::Join => unreachable!("control flow operation"),
            Operation::Split => unreachable!("control flow operation"),
            Operation::Loop => unreachable!("control flow operation"),
            Operation::Call => unreachable!("control flow operation"),
            Operation::SysCall => unreachable!("control flow operation"),
            Operation::Dyn => unreachable!("control flow operation"),
            Operation::Dyncall => unreachable!("control flow operation"),
            Operation::Span => unreachable!("control flow operation"),
            Operation::Repeat => unreachable!("control flow operation"),
            Operation::Respan => unreachable!("control flow operation"),
            Operation::End => unreachable!("control flow operation"),
            Operation::Halt => unreachable!("control flow operation"),

            // ----- field operations -------------------------------------------------------------
            Operation::Add => self.op_add()?,
            Operation::Neg => self.op_neg()?,
            Operation::Mul => self.op_mul()?,
            Operation::Inv => self.op_inv(op_idx, err_ctx)?,
            Operation::Incr => self.op_incr()?,
            Operation::And => self.op_and(err_ctx)?,
            Operation::Or => self.op_or(err_ctx)?,
            Operation::Not => self.op_not(err_ctx)?,
            Operation::Eq => self.op_eq()?,
            Operation::Eqz => self.op_eqz()?,
            Operation::Expacc => self.op_expacc(),
            Operation::Ext2Mul => self.op_ext2mul(),

            // ----- u32 operations ---------------------------------------------------------------
            Operation::U32split => self.op_u32split(),
            Operation::U32add => self.op_u32add(err_ctx)?,
            Operation::U32add3 => self.op_u32add3(err_ctx)?,
            Operation::U32sub => self.op_u32sub(op_idx, err_ctx)?,
            Operation::U32mul => self.op_u32mul(err_ctx)?,
            Operation::U32madd => self.op_u32madd(err_ctx)?,
            Operation::U32div => self.op_u32div(op_idx, err_ctx)?,
            Operation::U32and => self.op_u32and(err_ctx)?,
            Operation::U32xor => self.op_u32xor(err_ctx)?,
            Operation::U32assert2(err_code) => self.op_u32assert2(*err_code, err_ctx)?,

            // ----- stack manipulation -----------------------------------------------------------
            Operation::Pad => self.op_pad(),
            Operation::Drop => self.decrement_stack_size(),
            Operation::Dup0 => self.dup_nth(0),
            Operation::Dup1 => self.dup_nth(1),
            Operation::Dup2 => self.dup_nth(2),
            Operation::Dup3 => self.dup_nth(3),
            Operation::Dup4 => self.dup_nth(4),
            Operation::Dup5 => self.dup_nth(5),
            Operation::Dup6 => self.dup_nth(6),
            Operation::Dup7 => self.dup_nth(7),
            Operation::Dup9 => self.dup_nth(9),
            Operation::Dup11 => self.dup_nth(11),
            Operation::Dup13 => self.dup_nth(13),
            Operation::Dup15 => self.dup_nth(15),
            Operation::Swap => self.op_swap(),
            Operation::SwapW => self.swapw_nth(1),
            Operation::SwapW2 => self.swapw_nth(2),
            Operation::SwapW3 => self.swapw_nth(3),
            Operation::SwapDW => self.op_swap_double_word(),
            Operation::MovUp2 => self.rotate_left(3),
            Operation::MovUp3 => self.rotate_left(4),
            Operation::MovUp4 => self.rotate_left(5),
            Operation::MovUp5 => self.rotate_left(6),
            Operation::MovUp6 => self.rotate_left(7),
            Operation::MovUp7 => self.rotate_left(8),
            Operation::MovUp8 => self.rotate_left(9),
            Operation::MovDn2 => self.rotate_right(3),
            Operation::MovDn3 => self.rotate_right(4),
            Operation::MovDn4 => self.rotate_right(5),
            Operation::MovDn5 => self.rotate_right(6),
            Operation::MovDn6 => self.rotate_right(7),
            Operation::MovDn7 => self.rotate_right(8),
            Operation::MovDn8 => self.rotate_right(9),
            Operation::CSwap => self.op_cswap(err_ctx)?,
            Operation::CSwapW => self.op_cswapw(err_ctx)?,

            // ----- input / output ---------------------------------------------------------------
            Operation::Push(element) => self.op_push(*element),
            Operation::AdvPop => self.op_advpop(op_idx, err_ctx)?,
            Operation::AdvPopW => self.op_advpopw(op_idx, err_ctx)?,
            Operation::MLoadW => self.op_mloadw(op_idx, err_ctx)?,
            Operation::MStoreW => self.op_mstorew(op_idx, err_ctx)?,
            Operation::MLoad => self.op_mload(err_ctx)?,
            Operation::MStore => self.op_mstore(err_ctx)?,
            Operation::MStream => self.op_mstream(op_idx, err_ctx)?,
            Operation::Pipe => self.op_pipe(op_idx, err_ctx)?,

            // ----- cryptographic operations -----------------------------------------------------
            Operation::HPerm => self.op_hperm(),
            Operation::MpVerify(err_code) => self.op_mpverify(*err_code, program, err_ctx)?,
            Operation::MrUpdate => self.op_mrupdate(err_ctx)?,
            Operation::FriE2F4 => self.op_fri_ext2fold4()?,
            Operation::HornerBase => self.op_horner_eval_base(op_idx, err_ctx)?,
            Operation::HornerExt => self.op_horner_eval_ext(op_idx, err_ctx)?,
            Operation::ArithmeticCircuitEval => self.arithmetic_circuit_eval(op_idx, err_ctx)?,
        }

        Ok(())
    }

    // HELPERS
    // --------------------------------------------------------------------------------------------

    /// Increments the stack top pointer by 1.
    ///
    /// The bottom of the stack is never affected by this operation.
    #[inline(always)]
    fn increment_stack_size(&mut self) {
        self.stack_top_idx += 1;
        self.update_bounds_check_counter();
    }

    /// Decrements the stack top pointer by 1.
    ///
    /// The bottom of the stack is only decremented in cases where the stack depth would otherwise
    /// become less than 16.
    #[inline(always)]
    fn decrement_stack_size(&mut self) {
        self.stack_top_idx -= 1;
        self.stack_bot_idx = min(self.stack_bot_idx, self.stack_top_idx - MIN_STACK_DEPTH);
        self.update_bounds_check_counter();
    }

    /// Returns the size of the stack.
    #[inline(always)]
    fn stack_size(&self) -> usize {
        self.stack_top_idx - self.stack_bot_idx
    }

    /// Updates the bounds check counter.
    ///
    /// The bounds check counter is decremented by 1. If it reaches 0, it is reset to the minimum of
    /// the distance of the stack top from the low end of the stack buffer (less the required
    /// 16-element stack depth) and its distance from the high end of the buffer.
    ///
    /// The purpose of the bounds check counter is to ensure that we never access the stack buffer
    /// at an out-of-bounds index.
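    ///
    /// A worked example of the reset formula, using this file's constants (`STACK_BUFFER_SIZE =
    /// 6650`, `MIN_STACK_DEPTH = 16`) and the initial top index of 50:
    ///
    /// ```text
    /// bounds_check_counter = min(stack_top_idx - MIN_STACK_DEPTH, STACK_BUFFER_SIZE - stack_top_idx)
    ///                      = min(50 - 16, 6650 - 50)
    ///                      = 34
    /// ```
    ///
    /// i.e. at the initial stack position, 34 stack-depth-changing operations can run before the
    /// next bounds check.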
    #[inline(always)]
    fn update_bounds_check_counter(&mut self) {
        self.bounds_check_counter -= 1;

        if self.bounds_check_counter == 0 {
            // We will need to check the bounds either because we reach the low end or the high end
            // of the stack buffer. There are two worst cases that we are concerned about:
            // - we only execute instructions that decrease stack depth
            // - we only execute instructions that increase stack depth
            //
            // In the first case, we will hit the low end of the stack buffer; in the second case,
            // we will hit the high end of the stack buffer. We set the number of instructions that
            // is safe to execute to be the minimum of these two worst cases.

            self.bounds_check_counter =
                min(self.stack_top_idx - MIN_STACK_DEPTH, STACK_BUFFER_SIZE - self.stack_top_idx);
        }
    }

    /// Saves the current execution context and truncates the stack to 16 elements in preparation
    /// for starting a new execution context.
    fn save_context_and_truncate_stack(&mut self) {
        let overflow_stack = if self.stack_size() > MIN_STACK_DEPTH {
            // save the overflow stack, and zero out the buffer.
            //
            // Note: we need to zero the overflow buffer, since the new context expects ZEROs to be
            // pulled in if it decrements the stack size (e.g. by executing a `drop`).
            let overflow_stack =
                self.stack[self.stack_bot_idx..self.stack_top_idx - MIN_STACK_DEPTH].to_vec();
            self.stack[self.stack_bot_idx..self.stack_top_idx - MIN_STACK_DEPTH].fill(ZERO);

            overflow_stack
        } else {
            Vec::new()
        };

        self.stack_bot_idx = self.stack_top_idx - MIN_STACK_DEPTH;

        self.call_stack.push(ExecutionContextInfo {
            overflow_stack,
            ctx: self.ctx,
            fn_hash: self.caller_hash,
            fmp: self.fmp,
        });
    }

    /// Restores the execution context to the state it was in before the last `call`, `syscall` or
    /// `dyncall`.
    ///
    /// This includes restoring the overflow stack and the system parameters.
    ///
    /// # Errors
    /// - Returns an error if the overflow stack is larger than the space available in the stack
    ///   buffer.
    fn restore_context(&mut self, err_ctx: &impl ErrorContext) -> Result<(), ExecutionError> {
        // when a call/dyncall/syscall node ends, stack depth must be exactly 16.
        if self.stack_size() > MIN_STACK_DEPTH {
            return Err(ExecutionError::invalid_stack_depth_on_return(self.stack_size(), err_ctx));
        }

        let ctx_info = self
            .call_stack
            .pop()
            .expect("execution context stack should never be empty when restoring context");

        // restore the overflow stack
        {
            let overflow_len = ctx_info.overflow_stack.len();
            if overflow_len > self.stack_bot_idx {
                return Err(ExecutionError::FailedToExecuteProgram(
                    "stack underflow when restoring context",
                ));
            }

            self.stack[range(self.stack_bot_idx - overflow_len, overflow_len)]
                .copy_from_slice(&ctx_info.overflow_stack);
            self.stack_bot_idx -= overflow_len;
        }

        // restore system parameters
        self.ctx = ctx_info.ctx;
        self.fmp = ctx_info.fmp;
        self.in_syscall = false;
        self.caller_hash = ctx_info.fn_hash;

        Ok(())
    }

    // TESTING
    // --------------------------------------------------------------------------------------------

    /// Convenience sync wrapper around [Self::execute] for testing purposes.
    #[cfg(any(test, feature = "testing"))]
    pub fn execute_sync(
        self,
        program: &Program,
        host: &mut impl AsyncHost,
    ) -> Result<StackOutputs, ExecutionError> {
        // Create a new Tokio runtime and block on the async execution
        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();

        rt.block_on(self.execute(program, host))
    }

    /// Similar to [Self::execute_sync], but allows mutable access to the processor.
    #[cfg(any(test, feature = "testing"))]
    pub fn execute_sync_mut(
        &mut self,
        program: &Program,
        host: &mut impl AsyncHost,
    ) -> Result<StackOutputs, ExecutionError> {
        // Create a new Tokio runtime and block on the async execution
        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();

        rt.block_on(self.execute_impl(program, host))
    }
}

#[derive(Debug)]
pub struct FastProcessState<'a> {
    pub(super) processor: &'a mut FastProcessor,
    /// The index of the operation in its basic block.
    pub(super) op_idx: usize,
}

impl FastProcessor {
    #[inline(always)]
    pub fn state(&mut self, op_idx: usize) -> ProcessState<'_> {
        ProcessState::Fast(FastProcessState { processor: self, op_idx })
    }
}

// EXECUTION CONTEXT INFO
// ================================================================================================

/// Information about the execution context.
///
/// This struct is used to keep track of the information needed to return to the previous context
/// upon return from a `call`, `syscall` or `dyncall`.
#[derive(Debug)]
struct ExecutionContextInfo {
    /// This stores all the elements on the stack at the call site, excluding the top 16 elements.
    /// This corresponds to the overflow table in [crate::Process].
    overflow_stack: Vec<Felt>,
    ctx: ContextId,
    fn_hash: Word,
    fmp: Felt,
}