miden_processor/fast/mod.rs

use alloc::{boxed::Box, sync::Arc, vec::Vec};
use core::cmp::min;

use memory::Memory;
use miden_air::RowIndex;
use miden_core::{
    Decorator, DecoratorIterator, EMPTY_WORD, Felt, Kernel, ONE, Operation, Program, StackOutputs,
    WORD_SIZE, Word, ZERO,
    mast::{
        BasicBlockNode, CallNode, DynNode, ExternalNode, JoinNode, LoopNode, MastForest, MastNode,
        MastNodeId, OP_GROUP_SIZE, OpBatch, SplitNode,
    },
    stack::MIN_STACK_DEPTH,
    utils::range,
};
use miden_debug_types::{DefaultSourceManager, SourceManager};

use crate::{
    AdviceInputs, AdviceProvider, AsyncHost, ContextId, ErrorContext, ExecutionError, FMP_MIN,
    ProcessState, SYSCALL_FMP_MIN, add_error_ctx_to_external_error, chiplets::Ace, err_ctx,
    utils::resolve_external_node_async,
};

mod memory;

// Ops
mod circuit_eval;
mod crypto_ops;
mod field_ops;
mod fri_ops;
mod horner_ops;
mod io_ops;
mod stack_ops;
mod sys_ops;
mod u32_ops;

#[cfg(test)]
mod tests;

/// The size of the stack buffer.
///
/// Note: This value is much larger than it needs to be for the majority of programs. However, some
/// existing programs need it (e.g. `std::math::secp256k1::group::gen_mul`), so we're forced to push
/// it up. At this high a value, we're starting to see some performance degradation on benchmarks.
/// For example, the blake3 benchmark went from 285 MHz to 250 MHz (~10% degradation). Perhaps a
/// better solution would be to make this value much smaller (~1000), and then fall back to a `Vec`
/// if the stack overflows.
const STACK_BUFFER_SIZE: usize = 6650;

/// The initial position of the top of the stack in the stack buffer.
///
/// We place this value close to 0 because if a program hits the limit, it's much more likely to hit
/// the upper bound than the lower bound, since hitting the lower bound only occurs when a program
/// drops zeros that were automatically added to keep the stack depth at 16. In practice, if this
/// occurs, it is most likely a bug.
const INITIAL_STACK_TOP_IDX: usize = 50;

/// WORD_SIZE, but as a `Felt`.
const WORD_SIZE_FELT: Felt = Felt::new(4);

/// The size of a double-word.
const DOUBLE_WORD_SIZE: Felt = Felt::new(8);

/// A fast processor which doesn't generate any trace.
///
/// This processor is designed to be as fast as possible. Hence, it only keeps track of the current
/// state of the processor (i.e. the stack, current clock cycle, current memory context, and free
/// memory pointer).
///
/// # Stack Management
/// A few key points about how the stack was designed for maximum performance:
///
/// - The stack has a fixed buffer size defined by `STACK_BUFFER_SIZE`.
///     - This was observed to increase performance by at least 2x compared to using a `Vec` with
///       `push()` & `pop()`.
///     - We track the stack top and bottom using indices `stack_top_idx` and `stack_bot_idx`,
///       respectively.
/// - Since we are using a fixed-size buffer, we need to ensure that stack buffer accesses are not
///   out of bounds. Naively, we could check for this on every access. However, every operation
///   alters the stack depth by a predetermined amount, allowing us to precisely determine the
///   minimum number of operations required to reach a stack buffer boundary, whether at the top or
///   bottom.
///     - For example, if the stack top is 10 elements away from the top boundary, and the stack
///       bottom is 15 elements away from the bottom boundary, then we can safely execute 10
///       operations that modify the stack depth with no bounds check.
/// - When switching contexts (e.g., during a call or syscall), all elements past the first 16 are
///   stored in an `ExecutionContextInfo` struct, and the stack is truncated to 16 elements. This
///   will be restored when returning from the call or syscall.
///
/// # Clock Cycle Management
/// - The clock cycle (`clk`) is managed in the same way as in `Process`. That is, it is incremented
///   by 1 for every row that `Process` adds to the main trace.
///     - It is important to do so because the clock cycle is used to determine the context ID for
///       new execution contexts when using `call` or `dyncall`.
/// - When executing a basic block, the clock cycle is not incremented for every individual
///   operation for performance reasons.
///     - Rather, we use `clk + operation_index` to determine the clock cycle when needed.
///     - However, this performance improvement is slightly offset by the need to parse operation
///       batches exactly as `Process` does. We will be able to recover this performance loss by
///       redesigning the `BasicBlockNode`.
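///
/// # Example
///
/// A minimal usage sketch (not compiled as a doctest); it assumes that a `program: Program` and a
/// `host` implementing [`AsyncHost`] are already available:
///
/// ```ignore
/// // Initialize the stack with 3 on top (inputs are given in reverse order).
/// let stack_inputs = [Felt::new(1), Felt::new(2), Felt::new(3)];
/// let processor = FastProcessor::new(&stack_inputs);
///
/// // Run the program to completion and collect the stack outputs.
/// let stack_outputs = processor.execute(&program, &mut host).await?;
/// ```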
#[derive(Debug)]
pub struct FastProcessor {
    /// The stack is stored in reverse order, so that the last element is at the top of the stack.
    pub(super) stack: [Felt; STACK_BUFFER_SIZE],
    /// The index of the top of the stack.
    stack_top_idx: usize,
    /// The index of the bottom of the stack.
    stack_bot_idx: usize,
    /// The counter which keeps track of the number of instructions that we can execute without
    /// hitting the bounds of `stack`.
    bounds_check_counter: usize,

    /// The current clock cycle.
    ///
    /// However, when we are in a basic block, this corresponds to the clock cycle at which the
    /// basic block was entered. Hence, given an operation, we need to add its index in the
    /// block to this value to get the clock cycle.
    pub(super) clk: RowIndex,

    /// The current context ID.
    pub(super) ctx: ContextId,

    /// The free memory pointer.
    pub(super) fmp: Felt,

    /// Whether we are currently in a syscall.
    in_syscall: bool,

    /// The hash of the function that called into the current context, or `[ZERO, ZERO, ZERO,
    /// ZERO]` if we are in the first context (i.e. when `call_stack` is empty).
    pub(super) caller_hash: Word,

    /// The advice provider to be used during execution.
    pub(super) advice: AdviceProvider,

    /// A map from (context_id, word_address) to the word stored starting at that memory location.
    pub(super) memory: Memory,

    /// A map storing metadata per call to the ACE chiplet.
    pub(super) ace: Ace,

    /// The call stack is used when starting a new execution context (from a `call`, `syscall` or
    /// `dyncall`) to keep track of the information needed to return to the previous context upon
    /// return. It is a stack since calls can be nested.
    call_stack: Vec<ExecutionContextInfo>,

    /// Whether to enable debug statements and tracing.
    in_debug_mode: bool,

    /// The source manager (providing information about the location of each instruction).
    source_manager: Arc<dyn SourceManager>,
}

impl FastProcessor {
    // CONSTRUCTORS
    // ----------------------------------------------------------------------------------------------

    /// Creates a new `FastProcessor` instance with the given stack inputs.
    ///
    /// # Panics
    /// - Panics if the length of `stack_inputs` is greater than `MIN_STACK_DEPTH`.
    pub fn new(stack_inputs: &[Felt]) -> Self {
        Self::initialize(stack_inputs, AdviceInputs::default(), false)
    }

    /// Creates a new `FastProcessor` instance with the given stack and advice inputs.
    ///
    /// # Panics
    /// - Panics if the length of `stack_inputs` is greater than `MIN_STACK_DEPTH`.
    pub fn new_with_advice_inputs(stack_inputs: &[Felt], advice_inputs: AdviceInputs) -> Self {
        Self::initialize(stack_inputs, advice_inputs, false)
    }

    /// Creates a new `FastProcessor` instance, set to debug mode, with the given stack
    /// and advice inputs.
    ///
    /// # Panics
    /// - Panics if the length of `stack_inputs` is greater than `MIN_STACK_DEPTH`.
    pub fn new_debug(stack_inputs: &[Felt], advice_inputs: AdviceInputs) -> Self {
        Self::initialize(stack_inputs, advice_inputs, true)
    }

    /// Generic constructor unifying the above public ones.
    ///
    /// The stack inputs are expected to be stored in reverse order. For example, if `stack_inputs =
    /// [1,2,3]`, then the stack will be initialized as `[3,2,1,0,0,...]`, with `3` being on
    /// top.
    fn initialize(stack_inputs: &[Felt], advice_inputs: AdviceInputs, in_debug_mode: bool) -> Self {
        assert!(stack_inputs.len() <= MIN_STACK_DEPTH);

        let stack_top_idx = INITIAL_STACK_TOP_IDX;
        let stack = {
            let mut stack = [ZERO; STACK_BUFFER_SIZE];
            let bottom_idx = stack_top_idx - stack_inputs.len();

            stack[bottom_idx..stack_top_idx].copy_from_slice(stack_inputs);
            stack
        };

        let stack_bot_idx = stack_top_idx - MIN_STACK_DEPTH;

        let bounds_check_counter = stack_bot_idx;
        let source_manager = Arc::new(DefaultSourceManager::default());
        Self {
            advice: advice_inputs.into(),
            stack,
            stack_top_idx,
            stack_bot_idx,
            bounds_check_counter,
            clk: 0_u32.into(),
            ctx: 0_u32.into(),
            fmp: Felt::new(FMP_MIN),
            in_syscall: false,
            caller_hash: EMPTY_WORD,
            memory: Memory::new(),
            call_stack: Vec::new(),
            ace: Ace::default(),
            in_debug_mode,
            source_manager,
        }
    }

    /// Set the internal source manager to an externally initialized one.
    pub fn with_source_manager(mut self, source_manager: Arc<dyn SourceManager>) -> Self {
        self.source_manager = source_manager;
        self
    }

    // ACCESSORS
    // -------------------------------------------------------------------------------------------

    /// Returns the stack, such that the top of the stack is at the last index of the returned
    /// slice.
    pub fn stack(&self) -> &[Felt] {
        &self.stack[self.stack_bot_idx..self.stack_top_idx]
    }

    /// Returns the element on the stack at index `idx`.
    #[inline(always)]
    pub fn stack_get(&self, idx: usize) -> Felt {
        self.stack[self.stack_top_idx - idx - 1]
    }

    /// Mutable variant of `stack_get()`.
    #[inline(always)]
    pub fn stack_get_mut(&mut self, idx: usize) -> &mut Felt {
        &mut self.stack[self.stack_top_idx - idx - 1]
    }

    /// Returns the word on the stack starting at index `start_idx` in "stack order".
    ///
    /// That is, for `start_idx=0` the top element of the stack will be at the last position in the
    /// word.
    ///
    /// For example, if the stack looks like this:
    ///
    /// top                                                       bottom
    /// v                                                           v
    /// a | b | c | d | e | f | g | h | i | j | k | l | m | n | o | p
    ///
    /// Then
    /// - `stack_get_word(0)` returns `[d, c, b, a]`,
263    /// - `stack_get_word(1)` returns `[e, d, c ,b]`,
264    /// - etc.
265    #[inline(always)]
266    pub fn stack_get_word(&self, start_idx: usize) -> Word {
267        debug_assert!(start_idx < MIN_STACK_DEPTH);
268
269        let word_start_idx = self.stack_top_idx - start_idx - 4;
270        let result: [Felt; WORD_SIZE] =
271            self.stack[range(word_start_idx, WORD_SIZE)].try_into().unwrap();
272        result.into()
273    }
274
275    /// Returns the number of elements on the stack in the current context.
276    #[inline(always)]
277    pub fn stack_depth(&self) -> u32 {
278        (self.stack_top_idx - self.stack_bot_idx) as u32
279    }
280
281    // MUTATORS
282    // -------------------------------------------------------------------------------------------
283
284    /// Writes an element to the stack at the given index.
285    #[inline(always)]
286    pub fn stack_write(&mut self, idx: usize, element: Felt) {
287        self.stack[self.stack_top_idx - idx - 1] = element
288    }
289
290    /// Writes a word to the stack starting at the given index.
291    ///
    /// The word is written in "stack order", mirroring [`Self::stack_get_word`]: `start_idx` is
    /// the stack index that the last element of `word` is written to (i.e. the element closest to
    /// the top of the stack).
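    ///
    /// For example, writing the word `[w, x, y, z]` at `start_idx = 0` results in `z` becoming the
    /// new top of the stack, followed by `y`, `x`, and `w`, so that a subsequent
    /// `stack_get_word(0)` returns `[w, x, y, z]` again.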
    #[inline(always)]
    pub fn stack_write_word(&mut self, start_idx: usize, word: &Word) {
        debug_assert!(start_idx < MIN_STACK_DEPTH);

        let word_start_idx = self.stack_top_idx - start_idx - 4;
        let source: [Felt; WORD_SIZE] = (*word).into();
        self.stack[range(word_start_idx, WORD_SIZE)].copy_from_slice(&source)
    }

    /// Swaps the elements at the given indices on the stack.
    #[inline(always)]
    pub fn stack_swap(&mut self, idx1: usize, idx2: usize) {
        let a = self.stack_get(idx1);
        let b = self.stack_get(idx2);
        self.stack_write(idx1, b);
        self.stack_write(idx2, a);
    }

    // EXECUTE
    // -------------------------------------------------------------------------------------------

    /// Executes the given program and returns the stack outputs.
    pub async fn execute(
        mut self,
        program: &Program,
        host: &mut impl AsyncHost,
    ) -> Result<StackOutputs, ExecutionError> {
        self.execute_impl(program, host).await
    }

    /// Executes the given program and returns the stack outputs.
    ///
    /// This function is mainly split out of `execute()` for testing purposes so that we can access
    /// the internal state of the `FastProcessor` after execution.
    async fn execute_impl(
        &mut self,
        program: &Program,
        host: &mut impl AsyncHost,
    ) -> Result<StackOutputs, ExecutionError> {
        self.execute_mast_node(program.entrypoint(), program.mast_forest(), program.kernel(), host)
            .await?;

        StackOutputs::new(
            self.stack[self.stack_bot_idx..self.stack_top_idx]
                .iter()
                .rev()
                .copied()
                .collect(),
        )
        .map_err(|_| {
            ExecutionError::OutputStackOverflow(
                self.stack_top_idx - self.stack_bot_idx - MIN_STACK_DEPTH,
            )
        })
    }

    // NODE EXECUTORS
    // --------------------------------------------------------------------------------------------

    async fn execute_mast_node(
        &mut self,
        node_id: MastNodeId,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        let node = program
            .get_node_by_id(node_id)
            .ok_or(ExecutionError::MastNodeNotFoundInForest { node_id })?;

        // Note: we only run this in case there are Trace events associated with the node. However,
        // if there are assembly ops in the "before enter" list, we will cycle through them and
        // ignore them, resulting in a drop of performance. We should remove this after trace events
        // are removed from decorators - or if decorators are removed entirely.
        //
        // A similar reasoning applies to the "after exit" list.
        for &decorator_id in node.before_enter() {
            self.execute_decorator(&program[decorator_id], 0, host)?;
        }

        match node {
            MastNode::Block(basic_block_node) => {
                self.execute_basic_block_node(basic_block_node, program, host).await?
            },
            MastNode::Join(join_node) => {
                Box::pin(self.execute_join_node(join_node, program, kernel, host)).await?
            },
            MastNode::Split(split_node) => {
                Box::pin(self.execute_split_node(split_node, program, kernel, host)).await?
            },
            MastNode::Loop(loop_node) => {
                Box::pin(self.execute_loop_node(loop_node, program, kernel, host)).await?
            },
            MastNode::Call(call_node) => {
                let err_ctx = err_ctx!(program, call_node, self.source_manager.clone());
                add_error_ctx_to_external_error(
                    Box::pin(self.execute_call_node(call_node, program, kernel, host)).await,
                    err_ctx,
                )?
            },
            MastNode::Dyn(dyn_node) => {
                let err_ctx = err_ctx!(program, dyn_node, self.source_manager.clone());
                add_error_ctx_to_external_error(
                    Box::pin(self.execute_dyn_node(dyn_node, program, kernel, host)).await,
                    err_ctx,
                )?
            },
            MastNode::External(external_node) => {
                Box::pin(self.execute_external_node(external_node, kernel, host)).await?
            },
        }

        for &decorator_id in node.after_exit() {
            self.execute_decorator(&program[decorator_id], 0, host)?;
        }

        Ok(())
    }

    async fn execute_join_node(
        &mut self,
        join_node: &JoinNode,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        // Corresponds to the row inserted for the JOIN operation added to the trace.
        self.clk += 1_u32;

        self.execute_mast_node(join_node.first(), program, kernel, host).await?;
        self.execute_mast_node(join_node.second(), program, kernel, host).await?;

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        Ok(())
    }

    async fn execute_split_node(
        &mut self,
        split_node: &SplitNode,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        // Corresponds to the row inserted for the SPLIT operation added to the trace.
        self.clk += 1_u32;

        let condition = self.stack_get(0);

        // drop the condition from the stack
        self.decrement_stack_size();

        // execute the appropriate branch
        let ret = if condition == ONE {
            self.execute_mast_node(split_node.on_true(), program, kernel, host).await
        } else if condition == ZERO {
            self.execute_mast_node(split_node.on_false(), program, kernel, host).await
        } else {
            let err_ctx = err_ctx!(program, split_node, self.source_manager.clone());
            Err(ExecutionError::not_binary_value_if(condition, &err_ctx))
        };

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        ret
    }

    async fn execute_loop_node(
        &mut self,
        loop_node: &LoopNode,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        // Corresponds to the row inserted for the LOOP operation added to the trace.
        self.clk += 1_u32;

        // Read the initial loop condition from the top of the stack; within the loop, the
        // condition is re-checked after each execution of the loop body.
        let mut condition = self.stack_get(0);

        // drop the condition from the stack
        self.decrement_stack_size();

        // execute the loop body as long as the condition is true
        while condition == ONE {
            self.execute_mast_node(loop_node.body(), program, kernel, host).await?;

            // check the loop condition, and drop it from the stack
            condition = self.stack_get(0);
            self.decrement_stack_size();

            // this clock increment is for the row inserted for the `REPEAT` node added to
            // the trace on each iteration. It needs to be at the end of this loop (instead
            // of at the beginning), otherwise we get an off-by-one error when comparing
            // with [crate::Process].
            if condition == ONE {
                self.clk += 1_u32;
            }
        }

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        if condition == ZERO {
            Ok(())
        } else {
            let err_ctx = err_ctx!(program, loop_node, self.source_manager.clone());
            Err(ExecutionError::not_binary_value_loop(condition, &err_ctx))
        }
    }

    async fn execute_call_node(
        &mut self,
        call_node: &CallNode,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        let err_ctx = err_ctx!(program, call_node, self.source_manager.clone());

        // Corresponds to the row inserted for the CALL or SYSCALL operation added to the trace.
        self.clk += 1_u32;

        // call or syscall are not allowed inside a syscall
        if self.in_syscall {
            let instruction = if call_node.is_syscall() { "syscall" } else { "call" };
            return Err(ExecutionError::CallInSyscall(instruction));
        }

        let callee_hash = program
            .get_node_by_id(call_node.callee())
            .ok_or(ExecutionError::MastNodeNotFoundInForest { node_id: call_node.callee() })?
            .digest();

        self.save_context_and_truncate_stack();

        if call_node.is_syscall() {
            // check if the callee is in the kernel
            if !kernel.contains_proc(callee_hash) {
                return Err(ExecutionError::syscall_target_not_in_kernel(callee_hash, &err_ctx));
            }

            // set the system registers to the syscall context
            self.ctx = ContextId::root();
            self.fmp = SYSCALL_FMP_MIN.into();
            self.in_syscall = true;
        } else {
            // set the system registers to the callee context
            self.ctx = self.clk.into();
            self.fmp = Felt::new(FMP_MIN);
            self.caller_hash = callee_hash;
        }

        // Execute the callee.
        self.execute_mast_node(call_node.callee(), program, kernel, host).await?;

        // when returning from a function call or a syscall, restore the context of the
        // system registers and the operand stack to what it was prior to
        // the call.
        self.restore_context(&err_ctx)?;

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        Ok(())
    }

    async fn execute_dyn_node(
        &mut self,
        dyn_node: &DynNode,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        // Corresponds to the row inserted for the DYN or DYNCALL operation added to the trace.
        self.clk += 1_u32;

        // dyn calls are not allowed inside a syscall
        if dyn_node.is_dyncall() && self.in_syscall {
            return Err(ExecutionError::CallInSyscall("dyncall"));
        }

        let err_ctx = err_ctx!(program, dyn_node, self.source_manager.clone());

        // Retrieve callee hash from memory, using stack top as the memory address.
        let callee_hash = {
            let mem_addr = self.stack_get(0);
            self.memory
                .read_word(self.ctx, mem_addr, self.clk, &err_ctx)
                .map_err(ExecutionError::MemoryError)?
        };

        // Drop the memory address from the stack. This needs to be done BEFORE saving the context,
        // because the next instruction starts with a "shifted left" stack.
        self.decrement_stack_size();

        // For dyncall, save the context and reset it.
        if dyn_node.is_dyncall() {
            self.save_context_and_truncate_stack();
            self.ctx = self.clk.into();
            self.fmp = Felt::new(FMP_MIN);
            self.caller_hash = callee_hash;
        };

        // if the callee is not in the program's MAST forest, try to find a MAST forest for it in
        // the host (corresponding to an external library loaded in the host); if none are
        // found, return an error.
        match program.find_procedure_root(callee_hash) {
            Some(callee_id) => self.execute_mast_node(callee_id, program, kernel, host).await?,
            None => {
                let mast_forest = host
                    .get_mast_forest(&callee_hash)
                    .await
                    .ok_or_else(|| ExecutionError::dynamic_node_not_found(callee_hash, &err_ctx))?;

                // We limit the parts of the program that can be called externally to procedure
                // roots, even though MAST doesn't have that restriction.
                let root_id = mast_forest
                    .find_procedure_root(callee_hash)
                    .ok_or(ExecutionError::malfored_mast_forest_in_host(callee_hash, &err_ctx))?;

                self.execute_mast_node(root_id, &mast_forest, kernel, host).await?
            },
        }

        // For dyncall, restore the context.
        if dyn_node.is_dyncall() {
            self.restore_context(&err_ctx)?;
        }

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        Ok(())
    }

    async fn execute_external_node(
        &mut self,
        external_node: &ExternalNode,
        kernel: &Kernel,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        let (root_id, mast_forest) =
            resolve_external_node_async(external_node, &mut self.advice, host).await?;

        self.execute_mast_node(root_id, &mast_forest, kernel, host).await
    }

    // Note: when executing individual ops, we do not increment the clock by 1 at every iteration
    // for performance reasons (~25% performance drop). Hence, `self.clk` cannot be used directly to
    // determine the number of operations executed in a program.
    #[inline(always)]
    async fn execute_basic_block_node(
        &mut self,
        basic_block_node: &BasicBlockNode,
        program: &MastForest,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        // Corresponds to the row inserted for the SPAN operation added to the trace.
        self.clk += 1_u32;

        let mut batch_offset_in_block = 0;
        let mut op_batches = basic_block_node.op_batches().iter();
        let mut decorator_ids = basic_block_node.decorator_iter();

        // execute first op batch
        if let Some(first_op_batch) = op_batches.next() {
            self.execute_op_batch(
                basic_block_node,
                first_op_batch,
                &mut decorator_ids,
                batch_offset_in_block,
                program,
                host,
            )
            .await?;
            batch_offset_in_block += first_op_batch.ops().len();
        }

        // execute the rest of the op batches
        for op_batch in op_batches {
            // increment clock to account for `RESPAN`
            self.clk += 1_u32;

            self.execute_op_batch(
                basic_block_node,
                op_batch,
                &mut decorator_ids,
                batch_offset_in_block,
                program,
                host,
            )
            .await?;
            batch_offset_in_block += op_batch.ops().len();
        }

        // update clock with all the operations that executed
        self.clk += batch_offset_in_block as u32;

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        // execute any decorators which have not been executed during span ops execution; this can
        // happen for decorators appearing after all operations in a block. These decorators are
        // executed after the SPAN block is closed to make sure the VM clock cycle advances beyond
        // the last clock cycle of the SPAN block ops.
        for &decorator_id in decorator_ids {
            let decorator = program
                .get_decorator_by_id(decorator_id)
                .ok_or(ExecutionError::DecoratorNotFoundInForest { decorator_id })?;
            self.execute_decorator(decorator, 0, host)?;
        }

        Ok(())
    }

    #[inline(always)]
    async fn execute_op_batch(
        &mut self,
        basic_block: &BasicBlockNode,
        batch: &OpBatch,
        decorators: &mut DecoratorIterator<'_>,
        batch_offset_in_block: usize,
        program: &MastForest,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        let op_counts = batch.op_counts();
        let mut op_idx_in_group = 0;
        let mut group_idx = 0;
        let mut next_group_idx = 1;

        // round up the number of groups to be processed to the next power of two; we do this
        // because the processor requires the number of groups to be either 1, 2, 4, or 8; if
        // the actual number of groups is smaller, we'll pad the batch with NOOPs at the end
        let num_batch_groups = batch.num_groups().next_power_of_two();
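        // For example, a batch containing 3 groups is processed as if it had 4; the clock
        // adjustment at the end of this function accounts for one NOOP row per padded group.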

        // execute operations in the batch one by one
        for (op_idx_in_batch, op) in batch.ops().iter().enumerate() {
            while let Some(&decorator_id) =
                decorators.next_filtered(batch_offset_in_block + op_idx_in_batch)
            {
                let decorator = program
                    .get_decorator_by_id(decorator_id)
                    .ok_or(ExecutionError::DecoratorNotFoundInForest { decorator_id })?;
                self.execute_decorator(decorator, op_idx_in_batch, host)?;
            }

            // decode and execute the operation
            let op_idx_in_block = batch_offset_in_block + op_idx_in_batch;
            let err_ctx =
                err_ctx!(program, basic_block, self.source_manager.clone(), op_idx_in_block);

            // Execute the operation.
            //
            // Note: we handle the `Emit` operation separately, because it is an async operation,
            // whereas all the other operations are synchronous (resulting in a significant
            // performance improvement).
            match op {
                Operation::Emit(event_id) => {
                    self.op_emit(*event_id, op_idx_in_block, host, &err_ctx).await?
                },
                _ => {
                    // if the operation is not an Emit, we execute it normally
                    self.execute_op(op, op_idx_in_block, program, host, &err_ctx)?;
                },
            }

            // if the operation carries an immediate value, the value is stored at the next group
            // pointer; so, we advance the pointer to the following group
            let has_imm = op.imm_value().is_some();
            if has_imm {
                next_group_idx += 1;
            }

            // determine if we've executed all non-decorator operations in a group
            if op_idx_in_group == op_counts[group_idx] - 1 {
                // if we are at the end of the group, first check if the operation carries an
                // immediate value
                if has_imm {
                    // an operation with an immediate value cannot be the last operation in a group,
                    // so we need to execute a NOOP after it. In this processor, we increment the
                    // clock to account for the NOOP.
                    debug_assert!(op_idx_in_group < OP_GROUP_SIZE - 1, "invalid op index");
                    self.clk += 1_u32;
                }

                // then, move to the next group and reset operation index
                group_idx = next_group_idx;
                next_group_idx += 1;
                op_idx_in_group = 0;
            } else {
                op_idx_in_group += 1;
            }
        }

        // make sure we account for the required number of operation groups; padding is needed when
        // the actual number of operation groups is not a power of two. In this processor, this
        // corresponds to incrementing the clock by the number of empty op groups (i.e. 1 NOOP
        // executed per missing op group).

        self.clk += (num_batch_groups - group_idx) as u32;

        Ok(())
    }

    /// Executes the specified decorator
    fn execute_decorator(
        &mut self,
        decorator: &Decorator,
        op_idx_in_batch: usize,
        host: &mut impl AsyncHost,
    ) -> Result<(), ExecutionError> {
        match decorator {
            Decorator::Debug(options) => {
                if self.in_debug_mode {
                    let process = &mut self.state(op_idx_in_batch);
                    host.on_debug(process, options)?;
                }
            },
            Decorator::AsmOp(_assembly_op) => {
                // do nothing
            },
            Decorator::Trace(id) => {
                let process = &mut self.state(op_idx_in_batch);
                host.on_trace(process, *id)?;
            },
        };
        Ok(())
    }

    /// Executes the given operation.
    ///
    /// # Panics
    /// - if the operation is a control flow operation, as these are never executed,
    /// - if the operation is an `Emit` operation, as this requires async execution.
    #[inline(always)]
    fn execute_op(
        &mut self,
        operation: &Operation,
        op_idx: usize,
        program: &MastForest,
        host: &mut impl AsyncHost,
        err_ctx: &impl ErrorContext,
    ) -> Result<(), ExecutionError> {
        if self.bounds_check_counter == 0 {
            let err_str = if self.stack_top_idx - MIN_STACK_DEPTH == 0 {
                "stack underflow"
            } else {
                "stack overflow"
            };
            return Err(ExecutionError::FailedToExecuteProgram(err_str));
        }

        match operation {
            // ----- system operations ------------------------------------------------------------
            Operation::Noop => {
                // do nothing
            },
            Operation::Assert(err_code) => {
                self.op_assert(*err_code, op_idx, host, program, err_ctx)?
            },
            Operation::FmpAdd => self.op_fmpadd(),
            Operation::FmpUpdate => self.op_fmpupdate()?,
            Operation::SDepth => self.op_sdepth(),
            Operation::Caller => self.op_caller()?,
            Operation::Clk => self.op_clk(op_idx)?,
            Operation::Emit(_event_id) => {
                panic!("emit instruction requires async, so is not supported by execute_op()")
            },

            // ----- flow control operations ------------------------------------------------------
            // control flow operations are never executed directly
            Operation::Join => unreachable!("control flow operation"),
            Operation::Split => unreachable!("control flow operation"),
            Operation::Loop => unreachable!("control flow operation"),
            Operation::Call => unreachable!("control flow operation"),
            Operation::SysCall => unreachable!("control flow operation"),
            Operation::Dyn => unreachable!("control flow operation"),
            Operation::Dyncall => unreachable!("control flow operation"),
            Operation::Span => unreachable!("control flow operation"),
            Operation::Repeat => unreachable!("control flow operation"),
            Operation::Respan => unreachable!("control flow operation"),
            Operation::End => unreachable!("control flow operation"),
            Operation::Halt => unreachable!("control flow operation"),

            // ----- field operations -------------------------------------------------------------
            Operation::Add => self.op_add()?,
            Operation::Neg => self.op_neg()?,
            Operation::Mul => self.op_mul()?,
            Operation::Inv => self.op_inv(op_idx, err_ctx)?,
            Operation::Incr => self.op_incr()?,
            Operation::And => self.op_and(err_ctx)?,
            Operation::Or => self.op_or(err_ctx)?,
            Operation::Not => self.op_not(err_ctx)?,
            Operation::Eq => self.op_eq()?,
            Operation::Eqz => self.op_eqz()?,
            Operation::Expacc => self.op_expacc(),
            Operation::Ext2Mul => self.op_ext2mul(),

            // ----- u32 operations ---------------------------------------------------------------
            Operation::U32split => self.op_u32split(),
            Operation::U32add => self.op_u32add(err_ctx)?,
            Operation::U32add3 => self.op_u32add3(err_ctx)?,
            Operation::U32sub => self.op_u32sub(op_idx, err_ctx)?,
            Operation::U32mul => self.op_u32mul(err_ctx)?,
            Operation::U32madd => self.op_u32madd(err_ctx)?,
            Operation::U32div => self.op_u32div(op_idx, err_ctx)?,
            Operation::U32and => self.op_u32and(err_ctx)?,
            Operation::U32xor => self.op_u32xor(err_ctx)?,
            Operation::U32assert2(err_code) => self.op_u32assert2(*err_code, err_ctx)?,

            // ----- stack manipulation -----------------------------------------------------------
            Operation::Pad => self.op_pad(),
            Operation::Drop => self.decrement_stack_size(),
            Operation::Dup0 => self.dup_nth(0),
            Operation::Dup1 => self.dup_nth(1),
            Operation::Dup2 => self.dup_nth(2),
            Operation::Dup3 => self.dup_nth(3),
            Operation::Dup4 => self.dup_nth(4),
            Operation::Dup5 => self.dup_nth(5),
            Operation::Dup6 => self.dup_nth(6),
            Operation::Dup7 => self.dup_nth(7),
            Operation::Dup9 => self.dup_nth(9),
            Operation::Dup11 => self.dup_nth(11),
            Operation::Dup13 => self.dup_nth(13),
            Operation::Dup15 => self.dup_nth(15),
            Operation::Swap => self.op_swap(),
            Operation::SwapW => self.swapw_nth(1),
            Operation::SwapW2 => self.swapw_nth(2),
            Operation::SwapW3 => self.swapw_nth(3),
            Operation::SwapDW => self.op_swap_double_word(),
            Operation::MovUp2 => self.rotate_left(3),
            Operation::MovUp3 => self.rotate_left(4),
            Operation::MovUp4 => self.rotate_left(5),
            Operation::MovUp5 => self.rotate_left(6),
            Operation::MovUp6 => self.rotate_left(7),
            Operation::MovUp7 => self.rotate_left(8),
            Operation::MovUp8 => self.rotate_left(9),
            Operation::MovDn2 => self.rotate_right(3),
            Operation::MovDn3 => self.rotate_right(4),
            Operation::MovDn4 => self.rotate_right(5),
            Operation::MovDn5 => self.rotate_right(6),
            Operation::MovDn6 => self.rotate_right(7),
            Operation::MovDn7 => self.rotate_right(8),
            Operation::MovDn8 => self.rotate_right(9),
            Operation::CSwap => self.op_cswap(err_ctx)?,
            Operation::CSwapW => self.op_cswapw(err_ctx)?,

            // ----- input / output ---------------------------------------------------------------
            Operation::Push(element) => self.op_push(*element),
            Operation::AdvPop => self.op_advpop(op_idx, err_ctx)?,
            Operation::AdvPopW => self.op_advpopw(op_idx, err_ctx)?,
            Operation::MLoadW => self.op_mloadw(op_idx, err_ctx)?,
            Operation::MStoreW => self.op_mstorew(op_idx, err_ctx)?,
            Operation::MLoad => self.op_mload(err_ctx)?,
            Operation::MStore => self.op_mstore(err_ctx)?,
            Operation::MStream => self.op_mstream(op_idx, err_ctx)?,
            Operation::Pipe => self.op_pipe(op_idx, err_ctx)?,

            // ----- cryptographic operations -----------------------------------------------------
            Operation::HPerm => self.op_hperm(),
            Operation::MpVerify(err_code) => self.op_mpverify(*err_code, program, err_ctx)?,
            Operation::MrUpdate => self.op_mrupdate(err_ctx)?,
            Operation::FriE2F4 => self.op_fri_ext2fold4()?,
            Operation::HornerBase => self.op_horner_eval_base(op_idx, err_ctx)?,
            Operation::HornerExt => self.op_horner_eval_ext(op_idx, err_ctx)?,
            Operation::ArithmeticCircuitEval => self.arithmetic_circuit_eval(op_idx, err_ctx)?,
        }

        Ok(())
    }

    // HELPERS
    // ----------------------------------------------------------------------------------------------

    /// Increments the stack top pointer by 1.
    ///
    /// The bottom of the stack is never affected by this operation.
    #[inline(always)]
    fn increment_stack_size(&mut self) {
        self.stack_top_idx += 1;
        self.update_bounds_check_counter();
    }

    /// Decrements the stack top pointer by 1.
    ///
    /// The bottom of the stack is only decremented in cases where the stack depth would become less
    /// than 16.
    #[inline(always)]
    fn decrement_stack_size(&mut self) {
        self.stack_top_idx -= 1;
        self.stack_bot_idx = min(self.stack_bot_idx, self.stack_top_idx - MIN_STACK_DEPTH);
        self.update_bounds_check_counter();
    }

    /// Returns the size of the stack.
    #[inline(always)]
    fn stack_size(&self) -> usize {
        self.stack_top_idx - self.stack_bot_idx
    }

    /// Updates the bounds check counter.
    ///
    /// The bounds check counter is decremented by 1. If it reaches 0, it is reset to the minimum of
    /// the distance between the stack top and the low end of the buffer (accounting for the
    /// minimum stack depth) and the distance between the stack top and the high end of the buffer.
    ///
    /// The purpose of the bounds check counter is to ensure that we never access the stack buffer
    /// at an out-of-bounds index.
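    ///
    /// For example, with `stack_top_idx = 50` (the initial value) and `STACK_BUFFER_SIZE = 6650`,
    /// the counter is reset to `min(50 - 16, 6650 - 50) = 34`, i.e. 34 more stack-depth-changing
    /// operations can be executed before the next bounds check.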
    #[inline(always)]
    fn update_bounds_check_counter(&mut self) {
        self.bounds_check_counter -= 1;

        if self.bounds_check_counter == 0 {
            // We will need to check the bounds either because we reach the low end or the high end
            // of the stack buffer. There are two worst cases that we are concerned about:
            // - we only execute instructions that decrease stack depth
            // - we only execute instructions that increase stack depth
            //
            // In the first case, we will hit the low end of the stack buffer; in the second case,
            // we will hit the high end of the stack buffer. We set the number of instructions that
            // is safe to execute to be the minimum of these two worst cases.

            self.bounds_check_counter =
                min(self.stack_top_idx - MIN_STACK_DEPTH, STACK_BUFFER_SIZE - self.stack_top_idx);
        }
    }

    /// Saves the current execution context and truncates the stack to 16 elements in preparation
    /// for starting a new execution context.
    fn save_context_and_truncate_stack(&mut self) {
        let overflow_stack = if self.stack_size() > MIN_STACK_DEPTH {
            // save the overflow stack, and zero out the buffer.
            //
            // Note: we need to zero the overflow buffer, since the new context expects ZERO's to be
            // pulled in if they decrement the stack size (e.g. by executing a `drop`).
            let overflow_stack =
                self.stack[self.stack_bot_idx..self.stack_top_idx - MIN_STACK_DEPTH].to_vec();
            self.stack[self.stack_bot_idx..self.stack_top_idx - MIN_STACK_DEPTH].fill(ZERO);

            overflow_stack
        } else {
            Vec::new()
        };

        self.stack_bot_idx = self.stack_top_idx - MIN_STACK_DEPTH;

        self.call_stack.push(ExecutionContextInfo {
            overflow_stack,
            ctx: self.ctx,
            fn_hash: self.caller_hash,
            fmp: self.fmp,
        });
    }

    /// Restores the execution context to the state it was in before the last `call`, `syscall` or
    /// `dyncall`.
    ///
    /// This includes restoring the overflow stack and the system parameters.
    ///
    /// # Errors
    /// - Returns an error if the overflow stack is larger than the space available in the stack
    ///   buffer.
    fn restore_context(&mut self, err_ctx: &impl ErrorContext) -> Result<(), ExecutionError> {
        // when a call/dyncall/syscall node ends, stack depth must be exactly 16.
        if self.stack_size() > MIN_STACK_DEPTH {
            return Err(ExecutionError::invalid_stack_depth_on_return(self.stack_size(), err_ctx));
        }

        let ctx_info = self
            .call_stack
            .pop()
            .expect("execution context stack should never be empty when restoring context");

        // restore the overflow stack
        {
            let overflow_len = ctx_info.overflow_stack.len();
            if overflow_len > self.stack_bot_idx {
                return Err(ExecutionError::FailedToExecuteProgram(
                    "stack underflow when restoring context",
                ));
            }

            self.stack[range(self.stack_bot_idx - overflow_len, overflow_len)]
                .copy_from_slice(&ctx_info.overflow_stack);
            self.stack_bot_idx -= overflow_len;
        }

        // restore system parameters
        self.ctx = ctx_info.ctx;
        self.fmp = ctx_info.fmp;
        self.in_syscall = false;
        self.caller_hash = ctx_info.fn_hash;

        Ok(())
    }

    // TESTING
    // ----------------------------------------------------------------------------------------------

    /// Convenience sync wrapper to [Self::execute] for testing purposes.
    #[cfg(any(test, feature = "testing"))]
    pub fn execute_sync(
        self,
        program: &Program,
        host: &mut impl AsyncHost,
    ) -> Result<StackOutputs, ExecutionError> {
        // Create a new Tokio runtime and block on the async execution
        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();

        rt.block_on(self.execute(program, host))
    }

    /// Similar to [Self::execute_sync], but allows mutable access to the processor.
    #[cfg(any(test, feature = "testing"))]
    pub fn execute_sync_mut(
        &mut self,
        program: &Program,
        host: &mut impl AsyncHost,
    ) -> Result<StackOutputs, ExecutionError> {
        // Create a new Tokio runtime and block on the async execution
        let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();

        rt.block_on(self.execute_impl(program, host))
    }
}

#[derive(Debug)]
pub struct FastProcessState<'a> {
    pub(super) processor: &'a mut FastProcessor,
    /// the index of the operation in its basic block
    pub(super) op_idx: usize,
}

impl FastProcessor {
    #[inline(always)]
    pub fn state(&mut self, op_idx: usize) -> ProcessState<'_> {
        ProcessState::Fast(FastProcessState { processor: self, op_idx })
    }
}

// EXECUTION CONTEXT INFO
// ===============================================================================================

/// Information about the execution context.
///
/// This struct is used to keep track of the information needed to return to the previous context
/// upon return from a `call`, `syscall` or `dyncall`.
#[derive(Debug)]
struct ExecutionContextInfo {
    /// This stores all the elements on the stack at the call site, excluding the top 16 elements.
    /// This corresponds to the overflow table in [crate::Process].
    overflow_stack: Vec<Felt>,
    ctx: ContextId,
    fn_hash: Word,
    fmp: Felt,
}