miden_processor/fast/mod.rs

use alloc::vec::Vec;
use core::cmp::min;

use memory::Memory;
use miden_air::RowIndex;
use vm_core::{
    Decorator, DecoratorIterator, EMPTY_WORD, Felt, Kernel, ONE, Operation, Program, StackOutputs,
    WORD_SIZE, Word, ZERO,
    mast::{
        BasicBlockNode, CallNode, DynNode, ExternalNode, JoinNode, LoopNode, MastForest, MastNode,
        MastNodeId, OP_GROUP_SIZE, OpBatch, SplitNode,
    },
    stack::MIN_STACK_DEPTH,
    utils::range,
};

use crate::{
    ContextId, ExecutionError, FMP_MIN, Host, ProcessState, SYSCALL_FMP_MIN, chiplets::Ace,
    errors::ErrorContext, utils::resolve_external_node,
};

mod memory;

// Ops
mod circuit_eval;
mod crypto_ops;
mod field_ops;
mod fri_ops;
mod horner_ops;
mod io_ops;
mod stack_ops;
mod sys_ops;
mod u32_ops;

#[cfg(test)]
mod tests;

/// The size of the stack buffer.
///
/// Note: This value is much larger than it needs to be for the majority of programs. However, some
/// existing programs need it (e.g. `std::math::secp256k1::group::gen_mul`), so we're forced to push
/// it up. At this high a value, we're starting to see some performance degradation on benchmarks.
/// For example, the blake3 benchmark went from 285 MHz to 250 MHz (~10% degradation). Perhaps a
/// better solution would be to make this value much smaller (~1000), and then fall back to a `Vec`
/// if the stack overflows.
const STACK_BUFFER_SIZE: usize = 6650;
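
// A possible shape for the fallback suggested above (hypothetical sketch, not part of
// the current implementation): keep a small inline buffer for the common case and
// spill to a heap-allocated `Vec` only on overflow, e.g.
//
//     enum StackBuffer {
//         // covers the vast majority of programs with no heap traffic
//         Inline { buf: [Felt; 1024], top: usize },
//         // used only after the inline buffer overflows
//         Spilled(Vec<Felt>),
//     }
//
// The `Inline` fast path would keep today's cache behavior, while `Spilled` would
// remove the hard size limit.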

/// The initial position of the top of the stack in the stack buffer.
///
/// We place this value close to 0 because if a program hits the limit, it's much more likely to hit
/// the upper bound than the lower bound, since hitting the lower bound only occurs when you drop
/// 0's that were generated automatically to keep the stack depth at 16. In practice, if this
/// occurs, it is most likely a bug.
const INITIAL_STACK_TOP_IDX: usize = 50;
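
// With the values above, a program can net-drop at most `INITIAL_STACK_TOP_IDX -
// MIN_STACK_DEPTH = 34` elements before hitting the lower bound of the buffer, and
// net-push at most `STACK_BUFFER_SIZE - INITIAL_STACK_TOP_IDX = 6600` elements before
// hitting the upper bound.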

/// WORD_SIZE, but as a `Felt`.
const WORD_SIZE_FELT: Felt = Felt::new(4);

/// The size of a double-word (2 * WORD_SIZE), as a `Felt`.
const DOUBLE_WORD_SIZE: Felt = Felt::new(8);

/// A fast processor which doesn't generate any trace.
///
/// This processor is designed to be as fast as possible. Hence, it only keeps track of the current
/// state of the processor (i.e. the stack, current clock cycle, current memory context, and free
/// memory pointer).
///
/// # Stack Management
/// A few key points about how the stack was designed for maximum performance:
///
/// - The stack has a fixed buffer size defined by `STACK_BUFFER_SIZE`.
///     - This was observed to increase performance by at least 2x compared to using a `Vec` with
///       `push()` & `pop()`.
///     - We track the stack top and bottom using indices `stack_top_idx` and `stack_bot_idx`,
///       respectively.
/// - Since we are using a fixed-size buffer, we need to ensure that stack buffer accesses are not
///   out of bounds. Naively, we could check for this on every access. However, every operation
///   alters the stack depth by a predetermined amount, allowing us to precisely determine the
///   minimum number of operations required to reach a stack buffer boundary, whether at the top or
///   bottom.
///     - For example, if the stack top is 10 elements away from the top boundary, and the stack
///       bottom is 15 elements away from the bottom boundary, then we can safely execute 10
///       operations that modify the stack depth with no bounds check.
/// - When switching contexts (e.g., during a call or syscall), all elements past the first 16 are
///   stored in an `ExecutionContextInfo` struct, and the stack is truncated to 16 elements. This
///   state is restored when returning from the call or syscall.
///
/// # Clock Cycle Management
/// - The clock cycle (`clk`) is managed in the same way as in `Process`. That is, it is incremented
///   by 1 for every row that `Process` adds to the main trace.
///     - It is important to do so because the clock cycle is used to determine the context ID for
///       new execution contexts when using `call` or `dyncall`.
/// - When executing a basic block, the clock cycle is not incremented for every individual
///   operation, for performance reasons.
///     - Rather, we use `clk + operation_index` to determine the clock cycle when needed.
///     - However, this performance improvement is slightly offset by the need to parse operation
///       batches exactly as `Process` does. We will be able to recover the performance loss by
///       redesigning the `BasicBlockNode`.
#[derive(Debug)]
pub struct FastProcessor {
    /// The stack is stored in reverse order: the top of the stack is the element at index
    /// `stack_top_idx - 1`, and the stack grows towards higher indices.
    pub(super) stack: [Felt; STACK_BUFFER_SIZE],
    /// The index of the top of the stack.
    stack_top_idx: usize,
    /// The index of the bottom of the stack.
    stack_bot_idx: usize,
    /// The counter which keeps track of the number of instructions that we can execute without
    /// hitting the bounds of `stack`.
    bounds_check_counter: usize,

    /// The current clock cycle.
    ///
    /// Note: while executing a basic block, this corresponds to the clock cycle at which the
    /// basic block was entered. Hence, given an operation, we need to add its index in the
    /// block to this value to get the clock cycle.
    pub(super) clk: RowIndex,

    /// The current context ID.
    pub(super) ctx: ContextId,

    /// The free memory pointer.
    pub(super) fmp: Felt,

    /// Whether we are currently in a syscall.
    in_syscall: bool,

    /// The hash of the function that called into the current context, or `[ZERO, ZERO, ZERO,
    /// ZERO]` if we are in the first context (i.e. when `call_stack` is empty).
    pub(super) caller_hash: Word,

    /// A map from (context_id, word_address) to the word stored starting at that memory location.
    pub(super) memory: Memory,

    /// A map storing metadata per call to the ACE chiplet.
    pub(super) ace: Ace,

    /// The call stack is used when starting a new execution context (from a `call`, `syscall` or
    /// `dyncall`) to keep track of the information needed to return to the previous context upon
    /// return. It is a stack since calls can be nested.
    call_stack: Vec<ExecutionContextInfo>,

    /// Whether to enable debug statements and tracing.
    in_debug_mode: bool,
}

impl FastProcessor {
    // CONSTRUCTORS
    // -------------------------------------------------------------------------------------------

    /// Creates a new `FastProcessor` instance with the given stack inputs.
    ///
    /// The stack inputs are expected to be stored in reverse order. For example, if `stack_inputs =
    /// [1,2,3]`, then the stack will be initialized as `[3,2,1,0,0,...]`, with `3` being on
    /// top.
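    ///
    /// A minimal sketch of the resulting stack layout (marked `ignore` since the exact import
    /// path of this type depends on how the crate re-exports it):
    ///
    /// ```ignore
    /// use vm_core::Felt;
    ///
    /// let inputs = [Felt::new(1), Felt::new(2), Felt::new(3)];
    /// let processor = FastProcessor::new(&inputs);
    ///
    /// // The last input ends up on top of the stack.
    /// assert_eq!(processor.stack_get(0), Felt::new(3));
    /// assert_eq!(processor.stack_get(1), Felt::new(2));
    /// assert_eq!(processor.stack_get(2), Felt::new(1));
    /// ```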
    ///
    /// # Panics
    /// - Panics if the length of `stack_inputs` is greater than `MIN_STACK_DEPTH`.
    pub fn new(stack_inputs: &[Felt]) -> Self {
        Self::initialize(stack_inputs, false)
    }

    /// Creates a new `FastProcessor` instance with the given stack inputs, set to debug mode.
    ///
    /// The stack inputs are expected to be stored in reverse order. For example, if `stack_inputs =
    /// [1,2,3]`, then the stack will be initialized as `[3,2,1,0,0,...]`, with `3` being on
    /// top.
    ///
    /// # Panics
    /// - Panics if the length of `stack_inputs` is greater than `MIN_STACK_DEPTH`.
    pub fn new_debug(stack_inputs: &[Felt]) -> Self {
        Self::initialize(stack_inputs, true)
    }

    fn initialize(stack_inputs: &[Felt], in_debug_mode: bool) -> Self {
        assert!(stack_inputs.len() <= MIN_STACK_DEPTH);

        let stack_top_idx = INITIAL_STACK_TOP_IDX;
        let stack = {
            let mut stack = [ZERO; STACK_BUFFER_SIZE];
            let bottom_idx = stack_top_idx - stack_inputs.len();

            stack[bottom_idx..stack_top_idx].copy_from_slice(stack_inputs);
            stack
        };

        let stack_bot_idx = stack_top_idx - MIN_STACK_DEPTH;

        let bounds_check_counter = stack_bot_idx;

        FastProcessor {
            stack,
            stack_top_idx,
            stack_bot_idx,
            bounds_check_counter,
            clk: 0_u32.into(),
            ctx: 0_u32.into(),
            fmp: Felt::new(FMP_MIN),
            in_syscall: false,
            caller_hash: EMPTY_WORD,
            memory: Memory::new(),
            call_stack: Vec::new(),
            ace: Ace::default(),
            in_debug_mode,
        }
    }

    // ACCESSORS
    // -------------------------------------------------------------------------------------------

    /// Returns the stack, such that the top of the stack is at the last index of the returned
    /// slice.
    pub fn stack(&self) -> &[Felt] {
        &self.stack[self.stack_bot_idx..self.stack_top_idx]
    }

    /// Returns the element on the stack at index `idx`, where index 0 is the top of the stack.
    #[inline(always)]
    pub fn stack_get(&self, idx: usize) -> Felt {
        self.stack[self.stack_top_idx - idx - 1]
    }

    /// Mutable variant of `stack_get()`.
    #[inline(always)]
    pub fn stack_get_mut(&mut self, idx: usize) -> &mut Felt {
        &mut self.stack[self.stack_top_idx - idx - 1]
    }

    /// Returns the word on the stack starting at index `start_idx` in "stack order".
    ///
    /// That is, for `start_idx=0` the top element of the stack will be at the last position in the
    /// word.
    ///
    /// For example, if the stack looks like this:
    ///
    /// ```text
    /// top                                                       bottom
    /// v                                                           v
    /// a | b | c | d | e | f | g | h | i | j | k | l | m | n | o | p
    /// ```
    ///
    /// Then
    /// - `stack_get_word(0)` returns `[d, c, b, a]`,
    /// - `stack_get_word(1)` returns `[e, d, c, b]`,
    /// - etc.
    #[inline(always)]
    pub fn stack_get_word(&self, start_idx: usize) -> Word {
        debug_assert!(start_idx < MIN_STACK_DEPTH);

        let word_start_idx = self.stack_top_idx - start_idx - 4;
        self.stack[range(word_start_idx, WORD_SIZE)].try_into().unwrap()
    }

    /// Returns the number of elements on the stack in the current context.
    #[inline(always)]
    pub fn stack_depth(&self) -> u32 {
        (self.stack_top_idx - self.stack_bot_idx) as u32
    }

    // MUTATORS
    // -------------------------------------------------------------------------------------------

    /// Writes an element to the stack at the given index, where index 0 is the top of the stack.
    #[inline(always)]
    pub fn stack_write(&mut self, idx: usize, element: Felt) {
        self.stack[self.stack_top_idx - idx - 1] = element
    }

    /// Writes a word to the stack starting at the given index, in "stack order".
    ///
    /// That is, this is the inverse of `stack_get_word()`: `stack_write_word(0, &word)` places
    /// `word[3]` on top of the stack.
    #[inline(always)]
    pub fn stack_write_word(&mut self, start_idx: usize, word: &Word) {
        debug_assert!(start_idx < MIN_STACK_DEPTH);

        let word_start_idx = self.stack_top_idx - start_idx - 4;
        self.stack[range(word_start_idx, WORD_SIZE)].copy_from_slice(word)
    }

    /// Swaps the elements at the given indices on the stack.
    #[inline(always)]
    pub fn stack_swap(&mut self, idx1: usize, idx2: usize) {
        let a = self.stack_get(idx1);
        let b = self.stack_get(idx2);
        self.stack_write(idx1, b);
        self.stack_write(idx2, a);
    }

    // EXECUTE
    // -------------------------------------------------------------------------------------------

    /// Executes the given program and returns the stack outputs.
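    ///
    /// A sketch of the intended call pattern (marked `ignore` since assembling a `Program` and
    /// constructing a `Host` happen outside this module; `stack_inputs`, `program` and `host`
    /// are assumed to be in scope, with `host` being any value implementing the [`Host`] trait):
    ///
    /// ```ignore
    /// let processor = FastProcessor::new(&stack_inputs);
    /// let outputs = processor.execute(&program, &mut host)?;
    /// ```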
    pub fn execute(
        mut self,
        program: &Program,
        host: &mut impl Host,
    ) -> Result<StackOutputs, ExecutionError> {
        self.execute_impl(program, host)
    }

    /// Executes the given program and returns the stack outputs.
    ///
    /// This function is mainly split out of `execute()` for testing purposes so that we can access
    /// the internal state of the `FastProcessor` after execution.
    fn execute_impl(
        &mut self,
        program: &Program,
        host: &mut impl Host,
    ) -> Result<StackOutputs, ExecutionError> {
        self.execute_mast_node(
            program.entrypoint(),
            program.mast_forest(),
            program.kernel(),
            host,
        )?;

        StackOutputs::new(
            self.stack[self.stack_bot_idx..self.stack_top_idx]
                .iter()
                .rev()
                .copied()
                .collect(),
        )
        .map_err(|_| {
            ExecutionError::OutputStackOverflow(
                self.stack_top_idx - self.stack_bot_idx - MIN_STACK_DEPTH,
            )
        })
    }

    // NODE EXECUTORS
    // -------------------------------------------------------------------------------------------

    fn execute_mast_node(
        &mut self,
        node_id: MastNodeId,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl Host,
    ) -> Result<(), ExecutionError> {
        let node = program
            .get_node_by_id(node_id)
            .ok_or(ExecutionError::MastNodeNotFoundInForest { node_id })?;

        // Note: we only run this loop because there may be Trace events associated with the node.
        // However, if there are assembly ops in the "before enter" list, we will cycle through
        // them and ignore them, resulting in a drop of performance. We should remove this after
        // trace events are removed from decorators - or if decorators are removed entirely.
        //
        // A similar reasoning applies to the "after exit" list.
        for &decorator_id in node.before_enter() {
            self.execute_decorator(&program[decorator_id], 0, host)?;
        }

        match node {
            MastNode::Block(basic_block_node) => {
                self.execute_basic_block_node(basic_block_node, program, host)?
            },
            MastNode::Join(join_node) => {
                self.execute_join_node(join_node, program, kernel, host)?
            },
            MastNode::Split(split_node) => {
                self.execute_split_node(split_node, program, kernel, host)?
            },
            MastNode::Loop(loop_node) => {
                self.execute_loop_node(loop_node, program, kernel, host)?
            },
            MastNode::Call(call_node) => {
                self.execute_call_node(call_node, program, kernel, host)?
            },
            MastNode::Dyn(dyn_node) => self.execute_dyn_node(dyn_node, program, kernel, host)?,
            MastNode::External(external_node) => {
                self.execute_external_node(external_node, kernel, host)?
            },
        }

        for &decorator_id in node.after_exit() {
            self.execute_decorator(&program[decorator_id], 0, host)?;
        }

        Ok(())
    }

    fn execute_join_node(
        &mut self,
        join_node: &JoinNode,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl Host,
    ) -> Result<(), ExecutionError> {
        // Corresponds to the row inserted for the JOIN operation added to the trace.
        self.clk += 1_u32;

        self.execute_mast_node(join_node.first(), program, kernel, host)?;
        self.execute_mast_node(join_node.second(), program, kernel, host)?;

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        Ok(())
    }

    fn execute_split_node(
        &mut self,
        split_node: &SplitNode,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl Host,
    ) -> Result<(), ExecutionError> {
        // Corresponds to the row inserted for the SPLIT operation added to the trace.
        self.clk += 1_u32;

        let condition = self.stack_get(0);

        // drop the condition from the stack
        self.decrement_stack_size();

        // execute the appropriate branch
        let ret = if condition == ONE {
            self.execute_mast_node(split_node.on_true(), program, kernel, host)
        } else if condition == ZERO {
            self.execute_mast_node(split_node.on_false(), program, kernel, host)
        } else {
            Err(ExecutionError::not_binary_value_if(condition, &ErrorContext::default()))
        };

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        ret
    }

    fn execute_loop_node(
        &mut self,
        loop_node: &LoopNode,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl Host,
    ) -> Result<(), ExecutionError> {
        // Corresponds to the row inserted for the LOOP operation added to the trace.
        self.clk += 1_u32;

        // The loop condition is checked after the loop body is executed.
        let mut condition = self.stack_get(0);

        // drop the condition from the stack
        self.decrement_stack_size();

        // execute the loop body as long as the condition is true
        while condition == ONE {
            self.execute_mast_node(loop_node.body(), program, kernel, host)?;

            // check the loop condition, and drop it from the stack
            condition = self.stack_get(0);
            self.decrement_stack_size();

            // this clock increment is for the row inserted for the `REPEAT` node added to
            // the trace on each iteration. It needs to be at the end of this loop (instead
            // of at the beginning), otherwise we get an off-by-one error when comparing
            // with [crate::Process].
            if condition == ONE {
                self.clk += 1_u32;
            }
        }

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        if condition == ZERO {
            Ok(())
        } else {
            Err(ExecutionError::not_binary_value_loop(condition, &ErrorContext::default()))
        }
    }

    fn execute_call_node(
        &mut self,
        call_node: &CallNode,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl Host,
    ) -> Result<(), ExecutionError> {
        // Corresponds to the row inserted for the CALL or SYSCALL operation added to the trace.
        self.clk += 1_u32;

        // call or syscall are not allowed inside a syscall
        if self.in_syscall {
            let instruction = if call_node.is_syscall() { "syscall" } else { "call" };
            return Err(ExecutionError::CallInSyscall(instruction));
        }

        let callee_hash = program
            .get_node_by_id(call_node.callee())
            .ok_or(ExecutionError::MastNodeNotFoundInForest { node_id: call_node.callee() })?
            .digest();

        self.save_context_and_truncate_stack();

        if call_node.is_syscall() {
            // check if the callee is in the kernel
            if !kernel.contains_proc(callee_hash) {
                return Err(ExecutionError::syscall_target_not_in_kernel(
                    callee_hash,
                    &ErrorContext::default(),
                ));
            }

            // set the system registers to the syscall context
            self.ctx = ContextId::root();
            self.fmp = SYSCALL_FMP_MIN.into();
            self.in_syscall = true;
        } else {
            // set the system registers to the callee context
            self.ctx = self.clk.into();
            self.fmp = Felt::new(FMP_MIN);
            self.caller_hash = callee_hash.into();
        }

        // Execute the callee.
        self.execute_mast_node(call_node.callee(), program, kernel, host)?;

        // when returning from a function call or a syscall, restore the context of the
        // system registers and the operand stack to what it was prior to
        // the call.
        self.restore_context()?;

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        Ok(())
    }

    fn execute_dyn_node(
        &mut self,
        dyn_node: &DynNode,
        program: &MastForest,
        kernel: &Kernel,
        host: &mut impl Host,
    ) -> Result<(), ExecutionError> {
        // Corresponds to the row inserted for the DYN or DYNCALL operation added to the trace.
        self.clk += 1_u32;

        // dyn calls are not allowed inside a syscall
        if dyn_node.is_dyncall() && self.in_syscall {
            return Err(ExecutionError::CallInSyscall("dyncall"));
        }

        // Retrieve callee hash from memory, using stack top as the memory address.
        let callee_hash = {
            let mem_addr = self.stack_get(0);
            self.memory.read_word(self.ctx, mem_addr, self.clk).copied()?
        };

        // Drop the memory address from the stack. This needs to be done BEFORE saving the context,
        // because the next instruction starts with a "shifted left" stack.
        self.decrement_stack_size();

        // For dyncall, save the context and reset it.
        if dyn_node.is_dyncall() {
            self.save_context_and_truncate_stack();
            self.ctx = self.clk.into();
            self.fmp = Felt::new(FMP_MIN);
            self.caller_hash = callee_hash;
        }

        // if the callee is not in the program's MAST forest, try to find a MAST forest for it in
        // the host (corresponding to an external library loaded in the host); if none are
        // found, return an error.
        match program.find_procedure_root(callee_hash.into()) {
            Some(callee_id) => self.execute_mast_node(callee_id, program, kernel, host)?,
            None => {
                let mast_forest = host.get_mast_forest(&callee_hash.into()).ok_or_else(|| {
                    ExecutionError::dynamic_node_not_found(
                        callee_hash.into(),
                        &ErrorContext::default(),
                    )
                })?;

                // We limit the parts of the program that can be called externally to procedure
                // roots, even though MAST doesn't have that restriction.
                let root_id = mast_forest.find_procedure_root(callee_hash.into()).ok_or(
                    ExecutionError::malfored_mast_forest_in_host(
                        callee_hash.into(),
                        &ErrorContext::default(),
                    ),
                )?;

                self.execute_mast_node(root_id, &mast_forest, kernel, host)?
            },
        }

        // For dyncall, restore the context.
        if dyn_node.is_dyncall() {
            self.restore_context()?;
        }

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        Ok(())
    }

    fn execute_external_node(
        &mut self,
        external_node: &ExternalNode,
        kernel: &Kernel,
        host: &mut impl Host,
    ) -> Result<(), ExecutionError> {
        let (root_id, mast_forest) = resolve_external_node(external_node, host)?;

        self.execute_mast_node(root_id, &mast_forest, kernel, host)
    }

    // Note: when executing individual ops, we do not increment the clock by 1 at every iteration
    // for performance reasons (~25% performance drop). Hence, `self.clk` cannot be used directly to
    // determine the number of operations executed in a program.
    fn execute_basic_block_node(
        &mut self,
        basic_block_node: &BasicBlockNode,
        program: &MastForest,
        host: &mut impl Host,
    ) -> Result<(), ExecutionError> {
        // Corresponds to the row inserted for the SPAN operation added to the trace.
        self.clk += 1_u32;

        let mut batch_offset_in_block = 0;
        let mut op_batches = basic_block_node.op_batches().iter();
        let mut decorator_ids = basic_block_node.decorator_iter();

        // execute first op batch
        if let Some(first_op_batch) = op_batches.next() {
            self.execute_op_batch(
                first_op_batch,
                &mut decorator_ids,
                batch_offset_in_block,
                program,
                host,
            )?;
            batch_offset_in_block += first_op_batch.ops().len();
        }

        // execute the rest of the op batches
        for op_batch in op_batches {
            // increment clock to account for `RESPAN`
            self.clk += 1_u32;

            self.execute_op_batch(
                op_batch,
                &mut decorator_ids,
                batch_offset_in_block,
                program,
                host,
            )?;
            batch_offset_in_block += op_batch.ops().len();
        }

        // update clock with all the operations that executed
        self.clk += batch_offset_in_block as u32;

        // Corresponds to the row inserted for the END operation added to the trace.
        self.clk += 1_u32;

        // Execute any decorators which have not been executed during span ops execution; this can
        // happen for decorators appearing after all operations in a block. These decorators are
        // executed after the SPAN block is closed, to make sure the VM clock cycle advances beyond
        // the last clock cycle of the SPAN block ops.
        for &decorator_id in decorator_ids {
            let decorator = program
                .get_decorator_by_id(decorator_id)
                .ok_or(ExecutionError::DecoratorNotFoundInForest { decorator_id })?;
            self.execute_decorator(decorator, 0, host)?;
        }

        Ok(())
    }

    #[inline(always)]
    fn execute_op_batch(
        &mut self,
        batch: &OpBatch,
        decorators: &mut DecoratorIterator,
        batch_offset_in_block: usize,
        program: &MastForest,
        host: &mut impl Host,
    ) -> Result<(), ExecutionError> {
        let op_counts = batch.op_counts();
        let mut op_idx_in_group = 0;
        let mut group_idx = 0;
        let mut next_group_idx = 1;

        // round up the number of groups to be processed to the next power of two; we do this
        // because the processor requires the number of groups to be either 1, 2, 4, or 8; if
        // the actual number of groups is smaller, we'll pad the batch with NOOPs at the end
        let num_batch_groups = batch.num_groups().next_power_of_two();
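        // For example, a batch containing 3 groups is processed as 4: the missing group
        // corresponds to a single NOOP, which is accounted for by the clock increment in
        // the epilogue at the bottom of this function.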

        // execute operations in the batch one by one
        for (op_idx_in_batch, op) in batch.ops().iter().enumerate() {
            while let Some(&decorator_id) =
                decorators.next_filtered(batch_offset_in_block + op_idx_in_batch)
            {
                let decorator = program
                    .get_decorator_by_id(decorator_id)
                    .ok_or(ExecutionError::DecoratorNotFoundInForest { decorator_id })?;
                self.execute_decorator(decorator, op_idx_in_batch, host)?;
            }

            // decode and execute the operation
            self.execute_op(op, batch_offset_in_block + op_idx_in_batch, program, host)?;

            // if the operation carries an immediate value, the value is stored at the next group
            // pointer; so, we advance the pointer to the following group
            let has_imm = op.imm_value().is_some();
            if has_imm {
                next_group_idx += 1;
            }

            // determine if we've executed all non-decorator operations in a group
            if op_idx_in_group == op_counts[group_idx] - 1 {
                // if we are at the end of the group, first check if the operation carries an
                // immediate value
                if has_imm {
                    // an operation with an immediate value cannot be the last operation in a
                    // group, so we need to execute a NOOP after it. In this processor, we
                    // increment the clock to account for the NOOP.
                    debug_assert!(op_idx_in_group < OP_GROUP_SIZE - 1, "invalid op index");
                    self.clk += 1_u32;
                }

                // then, move to the next group and reset operation index
                group_idx = next_group_idx;
                next_group_idx += 1;
                op_idx_in_group = 0;
            } else {
                op_idx_in_group += 1;
            }
        }

        // Make sure we execute the required number of operation groups; this happens when the
        // actual number of operation groups was not a power of two. In this processor, this
        // corresponds to incrementing the clock by the number of empty op groups (i.e. 1 NOOP
        // executed per missing op group).
        self.clk += (num_batch_groups - group_idx) as u32;

        Ok(())
    }

    /// Executes the specified decorator.
    fn execute_decorator(
        &mut self,
        decorator: &Decorator,
        op_idx_in_batch: usize,
        host: &mut impl Host,
    ) -> Result<(), ExecutionError> {
        match decorator {
            Decorator::Debug(options) => {
                if self.in_debug_mode {
                    host.on_debug(ProcessState::new_fast(self, op_idx_in_batch), options)?;
                }
            },
            Decorator::AsmOp(_assembly_op) => {
                // do nothing
            },
            Decorator::Trace(id) => {
                host.on_trace(ProcessState::new_fast(self, op_idx_in_batch), *id)?;
            },
        };
        Ok(())
    }

    fn execute_op(
        &mut self,
        operation: &Operation,
        op_idx: usize,
        program: &MastForest,
        host: &mut impl Host,
    ) -> Result<(), ExecutionError> {
        if self.bounds_check_counter == 0 {
            let err_str = if self.stack_top_idx - MIN_STACK_DEPTH == 0 {
                "stack underflow"
            } else {
                "stack overflow"
            };
            return Err(ExecutionError::FailedToExecuteProgram(err_str));
        }

        match operation {
            // ----- system operations ------------------------------------------------------------
            Operation::Noop => {
                // do nothing
            },
            Operation::Assert(err_code) => self.op_assert(*err_code, op_idx, host, program)?,
            Operation::FmpAdd => self.op_fmpadd(),
            Operation::FmpUpdate => self.op_fmpupdate()?,
            Operation::SDepth => self.op_sdepth(),
            Operation::Caller => self.op_caller()?,
            Operation::Clk => self.op_clk(op_idx)?,
            Operation::Emit(event_id) => self.op_emit(*event_id, op_idx, host)?,

            // ----- flow control operations ------------------------------------------------------
            // control flow operations are never executed directly
            Operation::Join => unreachable!("control flow operation"),
            Operation::Split => unreachable!("control flow operation"),
            Operation::Loop => unreachable!("control flow operation"),
            Operation::Call => unreachable!("control flow operation"),
            Operation::SysCall => unreachable!("control flow operation"),
            Operation::Dyn => unreachable!("control flow operation"),
            Operation::Dyncall => unreachable!("control flow operation"),
            Operation::Span => unreachable!("control flow operation"),
            Operation::Repeat => unreachable!("control flow operation"),
            Operation::Respan => unreachable!("control flow operation"),
            Operation::End => unreachable!("control flow operation"),
            Operation::Halt => unreachable!("control flow operation"),

            // ----- field operations -------------------------------------------------------------
            Operation::Add => self.op_add()?,
            Operation::Neg => self.op_neg()?,
            Operation::Mul => self.op_mul()?,
            Operation::Inv => self.op_inv(op_idx)?,
            Operation::Incr => self.op_incr()?,
            Operation::And => self.op_and()?,
            Operation::Or => self.op_or()?,
            Operation::Not => self.op_not()?,
            Operation::Eq => self.op_eq()?,
            Operation::Eqz => self.op_eqz()?,
            Operation::Expacc => self.op_expacc(),
            Operation::Ext2Mul => self.op_ext2mul(),

            // ----- u32 operations ---------------------------------------------------------------
            Operation::U32split => self.op_u32split()?,
            Operation::U32add => self.op_u32add()?,
            Operation::U32add3 => self.op_u32add3()?,
            Operation::U32sub => self.op_u32sub(op_idx)?,
            Operation::U32mul => self.op_u32mul()?,
            Operation::U32madd => self.op_u32madd()?,
            Operation::U32div => self.op_u32div(op_idx)?,
            Operation::U32and => self.op_u32and()?,
            Operation::U32xor => self.op_u32xor()?,
            Operation::U32assert2(err_code) => self.op_u32assert2(*err_code)?,

            // ----- stack manipulation -----------------------------------------------------------
            Operation::Pad => self.op_pad(),
            Operation::Drop => self.decrement_stack_size(),
            Operation::Dup0 => self.dup_nth(0),
            Operation::Dup1 => self.dup_nth(1),
            Operation::Dup2 => self.dup_nth(2),
            Operation::Dup3 => self.dup_nth(3),
            Operation::Dup4 => self.dup_nth(4),
            Operation::Dup5 => self.dup_nth(5),
            Operation::Dup6 => self.dup_nth(6),
            Operation::Dup7 => self.dup_nth(7),
            Operation::Dup9 => self.dup_nth(9),
            Operation::Dup11 => self.dup_nth(11),
            Operation::Dup13 => self.dup_nth(13),
            Operation::Dup15 => self.dup_nth(15),
            Operation::Swap => self.op_swap(),
            Operation::SwapW => self.swapw_nth(1),
            Operation::SwapW2 => self.swapw_nth(2),
            Operation::SwapW3 => self.swapw_nth(3),
            Operation::SwapDW => self.op_swap_double_word(),
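            // `MovUpN` moves the element at depth N to the top of the stack, which is
            // equivalent to rotating the top N + 1 elements left by one position;
            // `MovDnN` is the inverse (a right rotation of the top N + 1 elements).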
            Operation::MovUp2 => self.rotate_left(3),
            Operation::MovUp3 => self.rotate_left(4),
            Operation::MovUp4 => self.rotate_left(5),
            Operation::MovUp5 => self.rotate_left(6),
            Operation::MovUp6 => self.rotate_left(7),
            Operation::MovUp7 => self.rotate_left(8),
            Operation::MovUp8 => self.rotate_left(9),
            Operation::MovDn2 => self.rotate_right(3),
            Operation::MovDn3 => self.rotate_right(4),
            Operation::MovDn4 => self.rotate_right(5),
            Operation::MovDn5 => self.rotate_right(6),
            Operation::MovDn6 => self.rotate_right(7),
            Operation::MovDn7 => self.rotate_right(8),
            Operation::MovDn8 => self.rotate_right(9),
            Operation::CSwap => self.op_cswap()?,
            Operation::CSwapW => self.op_cswapw()?,

            // ----- input / output ---------------------------------------------------------------
            Operation::Push(element) => self.op_push(*element),
            Operation::AdvPop => self.op_advpop(op_idx, host)?,
            Operation::AdvPopW => self.op_advpopw(op_idx, host)?,
            Operation::MLoadW => self.op_mloadw(op_idx)?,
            Operation::MStoreW => self.op_mstorew(op_idx)?,
            Operation::MLoad => self.op_mload()?,
            Operation::MStore => self.op_mstore()?,
            Operation::MStream => self.op_mstream(op_idx)?,
            Operation::Pipe => self.op_pipe(op_idx, host)?,

            // ----- cryptographic operations -----------------------------------------------------
            Operation::HPerm => self.op_hperm(),
            Operation::MpVerify(err_code) => self.op_mpverify(*err_code, host, program)?,
            Operation::MrUpdate => self.op_mrupdate(host)?,
            Operation::FriE2F4 => self.op_fri_ext2fold4()?,
            Operation::HornerBase => self.op_horner_eval_base(op_idx)?,
            Operation::HornerExt => self.op_horner_eval_ext(op_idx)?,
            Operation::ArithmeticCircuitEval => self.arithmetic_circuit_eval(op_idx)?,
        }

        Ok(())
    }

    // HELPERS
    // -------------------------------------------------------------------------------------------

    /// Increments the stack top pointer by 1.
    ///
    /// The bottom of the stack is never affected by this operation.
    #[inline(always)]
    fn increment_stack_size(&mut self) {
        self.stack_top_idx += 1;
        self.update_bounds_check_counter();
    }

    /// Decrements the stack top pointer by 1.
    ///
    /// The bottom of the stack is only decremented when the stack depth would otherwise become
    /// less than 16.
    #[inline(always)]
    fn decrement_stack_size(&mut self) {
        self.stack_top_idx -= 1;
        self.stack_bot_idx = min(self.stack_bot_idx, self.stack_top_idx - MIN_STACK_DEPTH);
        self.update_bounds_check_counter();
    }

    /// Returns the size of the stack.
    #[inline(always)]
    fn stack_size(&self) -> usize {
        self.stack_top_idx - self.stack_bot_idx
    }

    /// Updates the bounds check counter.
    ///
    /// The bounds check counter is decremented by 1. If it reaches 0, it is reset to the number of
    /// operations that can safely be executed before reaching either the low end or the high end
    /// of the stack buffer, whichever is closer.
    ///
    /// The purpose of the bounds check counter is to ensure that we never access the stack buffer
    /// at an out-of-bounds index.
    #[inline(always)]
    fn update_bounds_check_counter(&mut self) {
        self.bounds_check_counter -= 1;

        if self.bounds_check_counter == 0 {
            // We will need to check the bounds either because we reach the low end or the high end
            // of the stack buffer. There are two worst cases that we are concerned about:
            // - we only execute instructions that decrease stack depth
            // - we only execute instructions that increase stack depth
            //
            // In the first case, we will hit the low end of the stack buffer; in the second case,
            // we will hit the high end of the stack buffer. We set the number of instructions that
            // is safe to execute to be the minimum of these two worst cases.
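            // E.g., right after initialization (stack_top_idx = INITIAL_STACK_TOP_IDX = 50),
            // this would reset the counter to min(50 - 16, 6650 - 50) = 34: the next 34
            // depth-changing operations are guaranteed not to overrun the buffer.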
            self.bounds_check_counter =
                min(self.stack_top_idx - MIN_STACK_DEPTH, STACK_BUFFER_SIZE - self.stack_top_idx);
        }
    }

    /// Saves the current execution context and truncates the stack to 16 elements, in preparation
    /// for starting a new execution context.
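    ///
    /// For example, if the stack depth is 20 when a `call` is executed, the 4 elements below the
    /// top 16 are saved into the pushed [`ExecutionContextInfo`] and zeroed out in the buffer, so
    /// the new context starts with a depth of exactly 16.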
    fn save_context_and_truncate_stack(&mut self) {
        let overflow_stack = if self.stack_size() > MIN_STACK_DEPTH {
            // save the overflow stack, and zero out the buffer.
            //
            // Note: we need to zero the overflow buffer, since the new context expects ZERO's to be
            // pulled in if they decrement the stack size (e.g. by executing a `drop`).
            let overflow_stack =
                self.stack[self.stack_bot_idx..self.stack_top_idx - MIN_STACK_DEPTH].to_vec();
            self.stack[self.stack_bot_idx..self.stack_top_idx - MIN_STACK_DEPTH].fill(ZERO);

            overflow_stack
        } else {
            Vec::new()
        };

        self.stack_bot_idx = self.stack_top_idx - MIN_STACK_DEPTH;

        self.call_stack.push(ExecutionContextInfo {
            overflow_stack,
            ctx: self.ctx,
            fn_hash: self.caller_hash,
            fmp: self.fmp,
        });
    }

    /// Restores the execution context to the state it was in before the last `call`, `syscall` or
    /// `dyncall`.
    ///
    /// This includes restoring the overflow stack and the system parameters.
    ///
    /// # Errors
    /// - Returns an error if the overflow stack is larger than the space available in the stack
    ///   buffer.
    fn restore_context(&mut self) -> Result<(), ExecutionError> {
        // when a call/dyncall/syscall node ends, stack depth must be exactly 16.
        if self.stack_size() > MIN_STACK_DEPTH {
            return Err(ExecutionError::invalid_stack_depth_on_return(
                self.stack_size(),
                &ErrorContext::default(),
            ));
        }

        let ctx_info = self
            .call_stack
            .pop()
            .expect("execution context stack should never be empty when restoring context");

        // restore the overflow stack
        {
            let overflow_len = ctx_info.overflow_stack.len();
            if overflow_len > self.stack_bot_idx {
                return Err(ExecutionError::FailedToExecuteProgram(
                    "stack underflow when restoring context",
                ));
            }

            self.stack[range(self.stack_bot_idx - overflow_len, overflow_len)]
                .copy_from_slice(&ctx_info.overflow_stack);
            self.stack_bot_idx -= overflow_len;
        }

        // restore system parameters
        self.ctx = ctx_info.ctx;
        self.fmp = ctx_info.fmp;
        self.in_syscall = false;
        self.caller_hash = ctx_info.fn_hash;

        Ok(())
    }
}

// EXECUTION CONTEXT INFO
// ===============================================================================================

/// Information about the execution context.
///
/// This struct is used to keep track of the information needed to return to the previous context
/// upon return from a `call`, `syscall` or `dyncall`.
#[derive(Debug)]
struct ExecutionContextInfo {
    /// This stores all the elements on the stack at the call site, excluding the top 16 elements.
    /// This corresponds to the overflow table in [crate::Process].
    overflow_stack: Vec<Felt>,
    ctx: ContextId,
    fn_hash: Word,
    fmp: Felt,
}