miden_processor/parallel/mod.rs

1use alloc::{boxed::Box, vec::Vec};
2use core::{mem::MaybeUninit, ops::ControlFlow};
3
4use itertools::Itertools;
5use miden_air::{
6    FieldElement, RowIndex,
7    trace::{
8        CLK_COL_IDX, CTX_COL_IDX, DECODER_TRACE_OFFSET, DECODER_TRACE_WIDTH, FN_HASH_RANGE,
9        MIN_TRACE_LEN, PADDED_TRACE_WIDTH, STACK_TRACE_OFFSET, STACK_TRACE_WIDTH, SYS_TRACE_WIDTH,
10        TRACE_WIDTH,
11        decoder::{
12            ADDR_COL_IDX, GROUP_COUNT_COL_IDX, HASHER_STATE_OFFSET, IN_SPAN_COL_IDX,
13            NUM_HASHER_COLUMNS, NUM_OP_BATCH_FLAGS, NUM_OP_BITS, NUM_USER_OP_HELPERS,
14            OP_BATCH_FLAGS_OFFSET, OP_BITS_EXTRA_COLS_OFFSET, OP_BITS_OFFSET, OP_INDEX_COL_IDX,
15        },
16        main_trace::MainTrace,
17        stack::{B0_COL_IDX, B1_COL_IDX, H0_COL_IDX, STACK_TOP_OFFSET},
18    },
19};
20use miden_core::{
21    Felt, Kernel, ONE, OPCODE_PUSH, Operation, QuadFelt, StarkField, WORD_SIZE, Word, ZERO,
22    mast::{BasicBlockNode, MastForest, MastNode, MastNodeExt, MastNodeId, OpBatch},
23    precompile::PrecompileTranscriptState,
24    stack::MIN_STACK_DEPTH,
25    utils::{range, uninit_vector},
26};
27use rayon::prelude::*;
28use winter_prover::{crypto::RandomCoin, math::batch_inversion};
29
30use crate::{
31    ChipletsLengths, ColMatrix, ContextId, ErrorContext, ExecutionError, ExecutionTrace,
32    ProcessState, TraceLenSummary,
33    chiplets::{Chiplets, CircuitEvaluation, MAX_NUM_ACE_WIRES, PTR_OFFSET_ELEM, PTR_OFFSET_WORD},
34    continuation_stack::Continuation,
35    crypto::RpoRandomCoin,
36    decoder::{
37        AuxTraceBuilder as DecoderAuxTraceBuilder, BasicBlockContext,
38        block_stack::ExecutionContextInfo,
39    },
40    errors::AceError,
41    fast::{
42        ExecutionOutput, NoopTracer, Tracer,
43        execution_tracer::TraceGenerationContext,
44        trace_state::{
45            AceReplay, AdviceReplay, BitwiseOp, BitwiseReplay, CoreTraceFragmentContext,
46            DecoderState, ExecutionContextSystemInfo, HasherOp, HasherRequestReplay,
47            HasherResponseReplay, KernelReplay, MemoryReadsReplay, MemoryWritesReplay,
48            NodeExecutionState, NodeFlags,
49        },
50    },
51    host::default::NoopHost,
52    processor::{
53        MemoryInterface, OperationHelperRegisters, Processor, StackInterface, SystemInterface,
54    },
55    range::RangeChecker,
56    stack::AuxTraceBuilder as StackAuxTraceBuilder,
57    trace::{AuxTraceBuilders, NUM_RAND_ROWS},
58    utils::split_u32_into_u16,
59};
60
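/// Total width of the core trace: the system, decoder, and stack columns combined. The range
/// checker and chiplets columns are appended separately in `build_trace`.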
61pub const CORE_TRACE_WIDTH: usize = SYS_TRACE_WIDTH + DECODER_TRACE_WIDTH + STACK_TRACE_WIDTH;
62
63mod basic_block;
64mod call;
65mod r#dyn;
66mod join;
mod r#loop;
mod split;
70mod trace_builder;
71
72#[cfg(test)]
73mod tests;
74
75// BUILD TRACE
76// ================================================================================================
77
/// Builds the main trace in parallel from the provided execution output and trace generation
/// context.
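///
/// At a high level: the chiplets and the range checker are populated by replaying data recorded
/// during the fast execution, the core trace fragments are generated in parallel and combined
/// into contiguous columns, the range checker and chiplets traces are built concurrently, random
/// values are injected into the last `NUM_RAND_ROWS` rows, and the resulting columns are
/// assembled into an `ExecutionTrace`.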
79pub fn build_trace(
80    execution_output: ExecutionOutput,
81    trace_generation_context: TraceGenerationContext,
82    program_hash: Word,
83    kernel: Kernel,
84) -> ExecutionTrace {
85    let TraceGenerationContext {
86        core_trace_contexts,
87        range_checker_replay,
88        memory_writes,
89        bitwise_replay: bitwise,
90        kernel_replay,
91        hasher_for_chiplet,
92        ace_replay,
93        final_pc_transcript,
94        fragment_size,
95    } = trace_generation_context;
96
97    let chiplets = initialize_chiplets(
98        kernel.clone(),
99        &core_trace_contexts,
100        memory_writes,
101        bitwise,
102        kernel_replay,
103        hasher_for_chiplet,
104        ace_replay,
105    );
106
107    let range_checker = initialize_range_checker(range_checker_replay, &chiplets);
108
109    let fragments = generate_core_trace_fragments(core_trace_contexts, fragment_size);
110
111    // Calculate trace length
112    let core_trace_len = {
113        let core_trace_len: usize = fragments.iter().map(|f| f.row_count()).sum();
114
        // TODO(plafer): We need to do a "- 1" here to be consistent with Process::execute(),
        // which has a bug that causes it to not always insert a HALT row at the end of execution,
        // documented in [#1383](https://github.com/0xMiden/miden-vm/issues/1383). We correctly
        // insert a HALT row when generating the core trace fragments, so this "- 1" accounts for
        // that extra row. We should remove this "- 1" once Process::execute() is fixed or removed
        // entirely.
120        core_trace_len - 1
121    };
122
123    // Get the number of rows for the range checker
124    let range_table_len = range_checker.get_number_range_checker_rows();
125
126    let trace_len_summary =
127        TraceLenSummary::new(core_trace_len, range_table_len, ChipletsLengths::new(&chiplets));
128
129    // Compute the final main trace length, after accounting for random rows
130    let main_trace_len =
131        compute_main_trace_length(core_trace_len, range_table_len, chiplets.trace_len());
132
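    // Build the core trace columns (combined from the fragments), the range checker trace, and
    // the chiplets trace concurrently.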
133    let (core_trace_columns, (range_checker_trace, chiplets_trace)) = rayon::join(
134        || combine_fragments(fragments, main_trace_len),
135        || {
136            rayon::join(
137                || {
138                    range_checker.into_trace_with_table(
139                        range_table_len,
140                        main_trace_len,
141                        NUM_RAND_ROWS,
142                    )
143                },
144                || chiplets.into_trace(main_trace_len, NUM_RAND_ROWS, final_pc_transcript.state()),
145            )
146        },
147    );
148
    // Padding to make the number of columns a multiple of 8, i.e., the RPO permutation rate
150    let padding = vec![vec![ZERO; main_trace_len]; PADDED_TRACE_WIDTH - TRACE_WIDTH];
151
152    // Chain all trace columns together
153    let mut trace_columns: Vec<Vec<Felt>> = core_trace_columns
154        .into_iter()
155        .chain(range_checker_trace.trace)
156        .chain(chiplets_trace.trace)
157        .chain(padding)
158        .collect();
159
160    // Initialize random element generator using program hash
161    let mut rng = RpoRandomCoin::new(program_hash);
162
163    // Inject random values into the last NUM_RAND_ROWS rows for all columns
164    for i in main_trace_len - NUM_RAND_ROWS..main_trace_len {
165        for column in trace_columns.iter_mut() {
166            column[i] = rng.draw().expect("failed to draw a random value");
167        }
168    }
169
170    // Create the MainTrace
171    let main_trace = {
172        let last_program_row = RowIndex::from((core_trace_len as u32).saturating_sub(1));
173        let col_matrix = ColMatrix::new(trace_columns);
174        MainTrace::new(col_matrix, last_program_row)
175    };
176
177    // Create aux trace builders
178    let aux_trace_builders = AuxTraceBuilders {
179        decoder: DecoderAuxTraceBuilder::default(),
180        range: range_checker_trace.aux_builder,
181        chiplets: chiplets_trace.aux_builder,
182        stack: StackAuxTraceBuilder,
183    };
184
185    ExecutionTrace::new_from_parts(
186        program_hash,
187        kernel,
188        execution_output,
189        main_trace,
190        aux_trace_builders,
191        trace_len_summary,
192    )
193}
194
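/// Computes the length of the main trace: the longest of the core, range checker, and chiplets
/// trace segments, extended by `NUM_RAND_ROWS` and rounded up to the next power of two, but never
/// less than `MIN_TRACE_LEN`.
///
/// For example, if the longest segment has 1000 rows and `NUM_RAND_ROWS` is small (say, at most
/// 24), the returned length is 1024.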
195fn compute_main_trace_length(
196    core_trace_len: usize,
197    range_table_len: usize,
198    chiplets_trace_len: usize,
199) -> usize {
200    // Get the trace length required to hold all execution trace steps
201    let max_len = range_table_len.max(core_trace_len).max(chiplets_trace_len);
202
203    // Pad the trace length to the next power of two and ensure that there is space for random
204    // rows
205    let trace_len = (max_len + NUM_RAND_ROWS).next_power_of_two();
206    core::cmp::max(trace_len, MIN_TRACE_LEN)
207}
208
209/// Generates core trace fragments in parallel from the provided trace fragment contexts.
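///
/// Each context is turned into a fragment independently on the rayon thread pool. Afterwards,
/// the first row of every fragment is fixed up with the final stack/system rows of the preceding
/// fragment (or with the initial program state for the first fragment), and a final HALT row is
/// appended to the last fragment.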
210fn generate_core_trace_fragments(
211    core_trace_contexts: Vec<CoreTraceFragmentContext>,
212    fragment_size: usize,
213) -> Vec<CoreTraceFragment> {
214    // Save the first stack top for initialization
215    let first_stack_top = if let Some(first_context) = core_trace_contexts.first() {
216        first_context.state.stack.stack_top.to_vec()
217    } else {
218        vec![ZERO; MIN_STACK_DEPTH]
219    };
220
221    // Save the first system state for initialization
222    let first_system_state = [
223        ZERO, // clk starts at 0
224        ZERO, // ctx starts at 0 (root context)
225        ZERO, // fn_hash[0] starts as 0
226        ZERO, // fn_hash[1] starts as 0
227        ZERO, // fn_hash[2] starts as 0
228        ZERO, // fn_hash[3] starts as 0
229    ];
230
231    // Build the core trace fragments in parallel
232    let fragment_results: Vec<(
233        CoreTraceFragment,
234        [Felt; STACK_TRACE_WIDTH],
235        [Felt; SYS_TRACE_WIDTH],
236    )> = core_trace_contexts
237        .into_par_iter()
238        .map(|trace_state| {
239            let main_trace_generator = CoreTraceFragmentGenerator::new(trace_state, fragment_size);
240            main_trace_generator.generate_fragment()
241        })
242        .collect();
243
244    // Separate fragments, stack_rows, and system_rows
245    let mut fragments = Vec::new();
246    let mut stack_rows = Vec::new();
247    let mut system_rows = Vec::new();
248
249    for (fragment, stack_row, system_row) in fragment_results {
250        fragments.push(fragment);
251        stack_rows.push(stack_row);
252        system_rows.push(system_row);
253    }
254
    // Fix up the stack and system rows: the first fragment is seeded with the initial program
    // state, while every subsequent fragment is seeded with the final stack/system rows produced
    // by the previous fragment. This is needed because each operation writes the stack and system
    // state it produces into row i+1, so the first row of a fragment is only known once the
    // preceding fragment has been generated.
258    fixup_stack_and_system_rows(
259        &mut fragments,
260        &stack_rows,
261        &system_rows,
262        &first_stack_top,
263        &first_system_state,
264    );
265
266    append_halt_opcode_row(
267        fragments.last_mut().expect("expected at least one trace fragment"),
268        system_rows.last().expect(
269            "system_rows should not be empty, which indicates that there are no trace fragments",
270        ),
271        stack_rows.last().expect(
272            "stack_rows should not be empty, which indicates that there are no trace fragments",
273        ),
274    );
275
276    fragments
277}
278
279/// Fixes up the stack and system rows in fragments by initializing the first row of each fragment
280/// with the appropriate stack and system state.
281fn fixup_stack_and_system_rows(
282    fragments: &mut [CoreTraceFragment],
283    stack_rows: &[[Felt; STACK_TRACE_WIDTH]],
284    system_rows: &[[Felt; SYS_TRACE_WIDTH]],
285    first_stack_top: &[Felt],
286    first_system_state: &[Felt; SYS_TRACE_WIDTH],
287) {
288    const MIN_STACK_DEPTH_FELT: Felt = Felt::new(MIN_STACK_DEPTH as u64);
289
290    if fragments.is_empty() {
291        return;
292    }
293
294    // Initialize the first fragment with first_stack_top + [16, 0, 0] and first_system_state
295    if let Some(first_fragment) = fragments.first_mut() {
        // Set the system state columns
297        for (col_idx, &value) in first_system_state.iter().enumerate() {
298            first_fragment.columns[col_idx][0] = value;
299        }
300
301        // Set stack top (16 elements)
302        // Note: we call `rev()` here because the stack order is reversed in the trace.
303        // trace: [top, ..., bottom] vs stack: [bottom, ..., top]
304        for (stack_col_idx, &value) in first_stack_top.iter().rev().enumerate() {
305            first_fragment.columns[STACK_TRACE_OFFSET + STACK_TOP_OFFSET + stack_col_idx][0] =
306                value;
307        }
308
309        // Set stack helpers: [16, 0, 0]
310        first_fragment.columns[STACK_TRACE_OFFSET + B0_COL_IDX][0] = MIN_STACK_DEPTH_FELT;
311        first_fragment.columns[STACK_TRACE_OFFSET + B1_COL_IDX][0] = ZERO;
312        first_fragment.columns[STACK_TRACE_OFFSET + H0_COL_IDX][0] = ZERO;
313    }
314
315    // Initialize subsequent fragments with their corresponding rows from the previous fragment
316    // TODO(plafer): use zip
317    for (i, fragment) in fragments.iter_mut().enumerate().skip(1) {
318        if fragment.row_count() > 0 && i - 1 < stack_rows.len() && i - 1 < system_rows.len() {
319            // Copy the system_row to the first row of this fragment
320            let system_row = &system_rows[i - 1];
321            for (col_idx, &value) in system_row.iter().enumerate() {
322                fragment.columns[col_idx][0] = value;
323            }
324
325            // Copy the stack_row to the first row of this fragment
326            let stack_row = &stack_rows[i - 1];
327            for (col_idx, &value) in stack_row.iter().enumerate() {
328                fragment.columns[STACK_TRACE_OFFSET + col_idx][0] = value;
329            }
330        }
331    }
332}
333
334/// Appends a row with the HALT opcode to the end of the last fragment.
335///
336/// This ensures that the trace ends with at least one HALT operation, which is necessary to satisfy
337/// the constraints.
338fn append_halt_opcode_row(
339    last_fragment: &mut CoreTraceFragment,
340    last_system_state: &[Felt; SYS_TRACE_WIDTH],
341    last_stack_state: &[Felt; STACK_TRACE_WIDTH],
342) {
343    // system columns
344    // ---------------------------------------------------------------------------------------
345    for (col_idx, &value) in last_system_state.iter().enumerate() {
346        last_fragment.columns[col_idx].push(value);
347    }
348
349    // stack columns
350    // ---------------------------------------------------------------------------------------
351    for (col_idx, &value) in last_stack_state.iter().enumerate() {
352        last_fragment.columns[STACK_TRACE_OFFSET + col_idx].push(value);
353    }
354
355    // decoder columns: padding with final decoder state
356    // ---------------------------------------------------------------------------------------
357    // Pad addr trace (decoder block address column) with ZEROs
358    last_fragment.columns[DECODER_TRACE_OFFSET + ADDR_COL_IDX].push(ZERO);
359
360    // Pad op_bits columns with HALT opcode bits
361    let halt_opcode = Operation::Halt.op_code();
362    for bit_idx in 0..NUM_OP_BITS {
363        let bit_value = Felt::from((halt_opcode >> bit_idx) & 1);
364        last_fragment.columns[DECODER_TRACE_OFFSET + OP_BITS_OFFSET + bit_idx].push(bit_value);
365    }
366
367    // Pad hasher state columns (8 columns)
368    // - First 4 columns: copy the last value (to propagate program hash)
369    // - Remaining 4 columns: fill with ZEROs
370    for hasher_col_idx in 0..NUM_HASHER_COLUMNS {
371        let col_idx = DECODER_TRACE_OFFSET + HASHER_STATE_OFFSET + hasher_col_idx;
372        if hasher_col_idx < 4 {
373            // For first 4 hasher columns, copy the last value to propagate program hash
374            let last_row_idx = last_fragment.columns[col_idx].len() - 1;
375            let last_hasher_value = last_fragment.columns[col_idx][last_row_idx];
376            last_fragment.columns[col_idx].push(last_hasher_value);
377        } else {
378            // For remaining 4 hasher columns, fill with ZEROs
379            last_fragment.columns[col_idx].push(ZERO);
380        }
381    }
382
383    // Pad in_span column with ZEROs
384    last_fragment.columns[DECODER_TRACE_OFFSET + IN_SPAN_COL_IDX].push(ZERO);
385
386    // Pad group_count column with ZEROs
387    last_fragment.columns[DECODER_TRACE_OFFSET + GROUP_COUNT_COL_IDX].push(ZERO);
388
389    // Pad op_idx column with ZEROs
390    last_fragment.columns[DECODER_TRACE_OFFSET + OP_INDEX_COL_IDX].push(ZERO);
391
392    // Pad op_batch_flags columns (3 columns) with ZEROs
393    for batch_flag_idx in 0..NUM_OP_BATCH_FLAGS {
394        let col_idx = DECODER_TRACE_OFFSET + OP_BATCH_FLAGS_OFFSET + batch_flag_idx;
395        last_fragment.columns[col_idx].push(ZERO);
396    }
397
398    // Pad op_bit_extra columns (2 columns)
399    // - First column: fill with ZEROs (HALT doesn't use this)
400    // - Second column: fill with ONEs (product of two most significant HALT bits, both are 1)
401    last_fragment.columns[DECODER_TRACE_OFFSET + OP_BITS_EXTRA_COLS_OFFSET].push(ZERO);
402    last_fragment.columns[DECODER_TRACE_OFFSET + OP_BITS_EXTRA_COLS_OFFSET + 1].push(ONE);
403}
404
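/// Builds the range checker from the u32 range checks recorded during execution and the
/// memory-related range checks requested by the chiplets.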
405fn initialize_range_checker(
406    range_checker_replay: crate::fast::trace_state::RangeCheckerReplay,
407    chiplets: &Chiplets,
408) -> RangeChecker {
409    let mut range_checker = RangeChecker::new();
410
411    // Add all u32 range checks recorded during execution
412    for (clk, values) in range_checker_replay.into_iter() {
413        range_checker.add_range_checks(clk, &values);
414    }
415
416    // Add all memory-related range checks
417    chiplets.append_range_checks(&mut range_checker);
418
419    range_checker
420}
421
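/// Builds the chiplets module by replaying the hasher, bitwise, memory, ACE, and kernel ROM
/// requests recorded during execution.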
422fn initialize_chiplets(
423    kernel: Kernel,
424    core_trace_contexts: &[CoreTraceFragmentContext],
425    memory_writes: MemoryWritesReplay,
426    bitwise: BitwiseReplay,
427    kernel_replay: KernelReplay,
428    hasher_for_chiplet: HasherRequestReplay,
429    ace_replay: AceReplay,
430) -> Chiplets {
431    let mut chiplets = Chiplets::new(kernel);
432
433    // populate hasher chiplet
434    for hasher_op in hasher_for_chiplet.into_iter() {
435        match hasher_op {
436            HasherOp::Permute(input_state) => {
437                chiplets.hasher.permute(input_state);
438            },
439            HasherOp::HashControlBlock((h1, h2, domain, expected_hash)) => {
440                chiplets.hasher.hash_control_block(h1, h2, domain, expected_hash);
441            },
442            HasherOp::HashBasicBlock((op_batches, expected_hash)) => {
443                chiplets.hasher.hash_basic_block(&op_batches, expected_hash);
444            },
445            HasherOp::BuildMerkleRoot((value, path, index)) => {
446                chiplets.hasher.build_merkle_root(value, &path, index);
447            },
448            HasherOp::UpdateMerkleRoot((old_value, new_value, path, index)) => {
449                chiplets.hasher.update_merkle_root(old_value, new_value, &path, index);
450            },
451        }
452    }
453
454    // populate bitwise chiplet
455    for (bitwise_op, a, b) in bitwise {
456        match bitwise_op {
457            BitwiseOp::U32And => {
458                chiplets
459                    .bitwise
460                    .u32and(a, b, &())
461                    .expect("bitwise AND operation failed when populating chiplet");
462            },
463            BitwiseOp::U32Xor => {
464                chiplets
465                    .bitwise
466                    .u32xor(a, b, &())
467                    .expect("bitwise XOR operation failed when populating chiplet");
468            },
469        }
470    }
471
472    // populate memory chiplet
473    //
474    // Note: care is taken to order all the accesses by clock cycle, since the memory chiplet
475    // currently assumes that all memory accesses are issued in the same order as they appear in
476    // the trace.
477    {
478        let elements_written: Box<dyn Iterator<Item = MemoryAccess>> =
479            Box::new(memory_writes.iter_elements_written().map(|(element, addr, ctx, clk)| {
480                MemoryAccess::WriteElement(*addr, *element, *ctx, *clk)
481            }));
482        let words_written: Box<dyn Iterator<Item = MemoryAccess>> = Box::new(
483            memory_writes
484                .iter_words_written()
485                .map(|(word, addr, ctx, clk)| MemoryAccess::WriteWord(*addr, *word, *ctx, *clk)),
486        );
487        let elements_read: Box<dyn Iterator<Item = MemoryAccess>> =
488            Box::new(core_trace_contexts.iter().flat_map(|ctx| {
489                ctx.replay
490                    .memory_reads
491                    .iter_read_elements()
492                    .map(|(_, addr, ctx, clk)| MemoryAccess::ReadElement(addr, ctx, clk))
493            }));
494        let words_read: Box<dyn Iterator<Item = MemoryAccess>> =
495            Box::new(core_trace_contexts.iter().flat_map(|ctx| {
496                ctx.replay
497                    .memory_reads
498                    .iter_read_words()
499                    .map(|(_, addr, ctx, clk)| MemoryAccess::ReadWord(addr, ctx, clk))
500            }));
501
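        // Each of the four iterators above yields its accesses in execution order (i.e. by
        // increasing clock cycle), so a k-way merge by clock cycle yields a single stream ordered
        // by clock cycle, as required by the memory chiplet.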
502        [elements_written, words_written, elements_read, words_read]
503            .into_iter()
504            .kmerge_by(|a, b| a.clk() < b.clk())
505            .for_each(|mem_access| match mem_access {
506                MemoryAccess::ReadElement(addr, ctx, clk) => {
507                    chiplets
508                        .memory
509                        .read(ctx, addr, clk, &())
510                        .expect("memory read element failed when populating chiplet");
511                },
512                MemoryAccess::WriteElement(addr, element, ctx, clk) => {
513                    chiplets
514                        .memory
515                        .write(ctx, addr, clk, element, &())
516                        .expect("memory write element failed when populating chiplet");
517                },
518                MemoryAccess::ReadWord(addr, ctx, clk) => {
519                    chiplets
520                        .memory
521                        .read_word(ctx, addr, clk, &())
522                        .expect("memory read word failed when populating chiplet");
523                },
524                MemoryAccess::WriteWord(addr, word, ctx, clk) => {
525                    chiplets
526                        .memory
527                        .write_word(ctx, addr, clk, word, &())
528                        .expect("memory write word failed when populating chiplet");
529                },
530            });
531
532        enum MemoryAccess {
533            ReadElement(Felt, ContextId, RowIndex),
534            WriteElement(Felt, Felt, ContextId, RowIndex),
535            ReadWord(Felt, ContextId, RowIndex),
536            WriteWord(Felt, Word, ContextId, RowIndex),
537        }
538
539        impl MemoryAccess {
540            fn clk(&self) -> RowIndex {
541                match self {
542                    MemoryAccess::ReadElement(_, _, clk) => *clk,
543                    MemoryAccess::WriteElement(_, _, _, clk) => *clk,
544                    MemoryAccess::ReadWord(_, _, clk) => *clk,
545                    MemoryAccess::WriteWord(_, _, _, clk) => *clk,
546                }
547            }
548        }
549    }
550
551    // populate ACE chiplet
552    for (clk, circuit_eval) in ace_replay.into_iter() {
553        chiplets.ace.add_circuit_evaluation(clk, circuit_eval);
554    }
555
556    // populate kernel ROM
557    for proc_hash in kernel_replay.into_iter() {
558        chiplets
559            .kernel_rom
560            .access_proc(proc_hash, &())
561            .expect("kernel proc access failed when populating chiplet");
562    }
563
564    chiplets
565}
566
/// Combines multiple `CoreTraceFragment`s into contiguous core trace columns of length
/// `trace_len`.
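///
/// The columns are allocated uninitialized, filled by copying each fragment's rows in order, and
/// then padded from the last program row up to `trace_len`. Finally, the stack's H0 helper column
/// is run through batch inversion (the fragments store the pre-inverse values).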
568fn combine_fragments(fragments: Vec<CoreTraceFragment>, trace_len: usize) -> Vec<Vec<Felt>> {
569    if fragments.is_empty() {
570        panic!("Cannot combine empty fragments vector");
571    }
572
573    // Calculate total number of rows from fragments
574    let total_program_rows: usize = fragments.iter().map(|f| f.row_count()).sum();
575
576    // Initialize columns for the core trace only using uninitialized memory
577    let mut trace_columns: Vec<Box<[MaybeUninit<Felt>]>> =
578        (0..CORE_TRACE_WIDTH).map(|_| Box::new_uninit_slice(trace_len)).collect();
579
580    // Copy core trace columns from fragments
581    let mut current_row_idx = 0;
582    for fragment in fragments {
583        let fragment_rows = fragment.row_count();
584
585        for local_row_idx in 0..fragment_rows {
586            let global_row_idx = current_row_idx + local_row_idx;
587
588            // Copy core trace columns (system, decoder, stack)
589            for (col_idx, trace_column) in trace_columns.iter_mut().enumerate() {
590                trace_column[global_row_idx].write(fragment.columns[col_idx][local_row_idx]);
591            }
592        }
593
594        current_row_idx += fragment_rows;
595    }
596
597    // Pad the remaining rows (between total_program_rows and trace_len)
598    pad_trace_columns(&mut trace_columns, total_program_rows, trace_len);
599
600    // Convert uninitialized columns to initialized Vec<Felt>
601    let mut core_trace_columns: Vec<Vec<Felt>> = trace_columns
602        .into_iter()
603        .map(|uninit_column| {
604            // Safety: All elements have been initialized through MaybeUninit::write()
605            let init_column = unsafe { uninit_column.assume_init() };
606            Vec::from(init_column)
607        })
608        .collect();
609
610    // Run batch inversion on stack's H0 helper column
611    core_trace_columns[STACK_TRACE_OFFSET + H0_COL_IDX] =
612        batch_inversion(&core_trace_columns[STACK_TRACE_OFFSET + H0_COL_IDX]);
613
614    // Return the core trace columns
615    core_trace_columns
616}
617
618/// Pads the trace columns from `total_program_rows` rows to `trace_len` rows.
619///
620/// # Safety
621/// - This function assumes that the first `total_program_rows` rows of the trace columns are
622///   already initialized.
623///
624/// # Panics
625/// - If `total_program_rows` is zero.
626/// - If `total_program_rows + NUM_RAND_ROWS - 1 > trace_len`.
627fn pad_trace_columns(
628    trace_columns: &mut [Box<[MaybeUninit<Felt>]>],
629    total_program_rows: usize,
630    trace_len: usize,
631) {
632    // TODO(plafer): parallelize this function
633    assert_ne!(total_program_rows, 0);
634    assert!(total_program_rows + NUM_RAND_ROWS - 1 <= trace_len);
635
636    // System columns
637    // ------------------------
638
639    // Pad CLK trace - fill with index values
640    for (clk_val, clk_row) in
641        trace_columns[CLK_COL_IDX].iter_mut().enumerate().skip(total_program_rows)
642    {
643        clk_row.write(Felt::from(clk_val as u32));
644    }
645
646    // Pad CTX trace - fill with ZEROs (root context)
647    for ctx_row in trace_columns[CTX_COL_IDX].iter_mut().skip(total_program_rows) {
648        ctx_row.write(ZERO);
649    }
650
651    // Pad FN_HASH traces (4 columns) - fill with ZEROs as program execution must always end in the
652    // root context.
653    for fn_hash_col_idx in FN_HASH_RANGE {
654        for fn_hash_row in trace_columns[fn_hash_col_idx].iter_mut().skip(total_program_rows) {
655            fn_hash_row.write(ZERO);
656        }
657    }
658
659    // Decoder columns
660    // ------------------------
661
662    // Pad addr trace (decoder block address column) with ZEROs
663    for addr_row in trace_columns[DECODER_TRACE_OFFSET + ADDR_COL_IDX]
664        .iter_mut()
665        .skip(total_program_rows)
666    {
667        addr_row.write(ZERO);
668    }
669
670    // Pad op_bits columns with HALT opcode bits
671    let halt_opcode = Operation::Halt.op_code();
672    for i in 0..NUM_OP_BITS {
673        let bit_value = Felt::from((halt_opcode >> i) & 1);
674        for op_bit_row in trace_columns[DECODER_TRACE_OFFSET + OP_BITS_OFFSET + i]
675            .iter_mut()
676            .skip(total_program_rows)
677        {
678            op_bit_row.write(bit_value);
679        }
680    }
681
682    // Pad hasher state columns (8 columns)
683    // - First 4 columns: copy the last value (to propagate program hash)
684    // - Remaining 4 columns: fill with ZEROs
685    for i in 0..NUM_HASHER_COLUMNS {
686        let col_idx = DECODER_TRACE_OFFSET + HASHER_STATE_OFFSET + i;
687        if i < 4 {
688            // For first 4 hasher columns, copy the last value to propagate program hash
689            // Safety: per our documented safety guarantees, we know that `total_program_rows > 0`,
690            // and row `total_program_rows - 1` is initialized.
691            let last_hasher_value =
692                unsafe { trace_columns[col_idx][total_program_rows - 1].assume_init() };
693            for hasher_row in trace_columns[col_idx].iter_mut().skip(total_program_rows) {
694                hasher_row.write(last_hasher_value);
695            }
696        } else {
697            // For remaining 4 hasher columns, fill with ZEROs
698            for hasher_row in trace_columns[col_idx].iter_mut().skip(total_program_rows) {
699                hasher_row.write(ZERO);
700            }
701        }
702    }
703
704    // Pad in_span column with ZEROs
705    for in_span_row in trace_columns[DECODER_TRACE_OFFSET + IN_SPAN_COL_IDX]
706        .iter_mut()
707        .skip(total_program_rows)
708    {
709        in_span_row.write(ZERO);
710    }
711
712    // Pad group_count column with ZEROs
713    for group_count_row in trace_columns[DECODER_TRACE_OFFSET + GROUP_COUNT_COL_IDX]
714        .iter_mut()
715        .skip(total_program_rows)
716    {
717        group_count_row.write(ZERO);
718    }
719
720    // Pad op_idx column with ZEROs
721    for op_idx_row in trace_columns[DECODER_TRACE_OFFSET + OP_INDEX_COL_IDX]
722        .iter_mut()
723        .skip(total_program_rows)
724    {
725        op_idx_row.write(ZERO);
726    }
727
728    // Pad op_batch_flags columns (3 columns) with ZEROs
729    for i in 0..NUM_OP_BATCH_FLAGS {
730        let col_idx = DECODER_TRACE_OFFSET + OP_BATCH_FLAGS_OFFSET + i;
731        for op_batch_flag_row in trace_columns[col_idx].iter_mut().skip(total_program_rows) {
732            op_batch_flag_row.write(ZERO);
733        }
734    }
735
736    // Pad op_bit_extra columns (2 columns)
737    // - First column: fill with ZEROs (HALT doesn't use this)
738    // - Second column: fill with ONEs (product of two most significant HALT bits, both are 1)
739    for op_bit_extra_row in trace_columns[DECODER_TRACE_OFFSET + OP_BITS_EXTRA_COLS_OFFSET]
740        .iter_mut()
741        .skip(total_program_rows)
742    {
743        op_bit_extra_row.write(ZERO);
744    }
745    for op_bit_extra_row in trace_columns[DECODER_TRACE_OFFSET + OP_BITS_EXTRA_COLS_OFFSET + 1]
746        .iter_mut()
747        .skip(total_program_rows)
748    {
749        op_bit_extra_row.write(ONE);
750    }
751
752    // Stack columns
753    // ------------------------
754
755    // Pad stack columns with the last value in each column (analogous to Stack::into_trace())
756    for i in 0..STACK_TRACE_WIDTH {
757        let col_idx = STACK_TRACE_OFFSET + i;
758        // Safety: per our documented safety guarantees, we know that `total_program_rows > 0`,
759        // and row `total_program_rows - 1` is initialized.
760        let last_stack_value =
761            unsafe { trace_columns[col_idx][total_program_rows - 1].assume_init() };
762        for stack_row in trace_columns[col_idx].iter_mut().skip(total_program_rows) {
763            stack_row.write(last_stack_value);
764        }
765    }
766}
767
768// CORE TRACE FRAGMENT
769// ================================================================================================
770
/// The columns of the main trace fragment. These consist of the system, decoder, and stack
/// columns.
///
/// A fragment is a collection of columns of length `fragment_size` or less; only a fragment
/// containing a `HALT` operation is allowed to be shorter than `fragment_size`.
776struct CoreTraceFragment {
777    pub columns: [Vec<Felt>; CORE_TRACE_WIDTH],
778}
779
780impl CoreTraceFragment {
781    /// Creates a new CoreTraceFragment with *uninitialized* columns of length `num_rows`.
782    ///
783    /// # Safety
784    /// The caller is responsible for ensuring that the columns are properly initialized
785    /// before use.
786    pub unsafe fn new_uninit(num_rows: usize) -> Self {
787        Self {
788            // TODO(plafer): Don't use uninit_vector
789            columns: core::array::from_fn(|_| unsafe { uninit_vector(num_rows) }),
790        }
791    }
792
793    /// Returns the number of rows in this fragment
794    pub fn row_count(&self) -> usize {
795        self.columns[0].len()
796    }
797}
798
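/// Generates a single [`CoreTraceFragment`] from a [`CoreTraceFragmentContext`] by re-executing
/// the corresponding segment of the program, replaying the non-deterministic data (e.g. memory
/// reads, block addresses, execution contexts) recorded during the fast execution.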
799struct CoreTraceFragmentGenerator {
800    fragment_start_clk: RowIndex,
801    fragment: CoreTraceFragment,
802    context: CoreTraceFragmentContext,
803    span_context: Option<BasicBlockContext>,
804    stack_rows: Option<[Felt; STACK_TRACE_WIDTH]>,
805    system_rows: Option<[Felt; SYS_TRACE_WIDTH]>,
806    fragment_size: usize,
807}
808
809impl CoreTraceFragmentGenerator {
    /// Creates a new `CoreTraceFragmentGenerator` for the provided fragment context.
811    pub fn new(context: CoreTraceFragmentContext, fragment_size: usize) -> Self {
812        Self {
813            fragment_start_clk: context.state.system.clk,
            // Safety: the `CoreTraceFragmentGenerator` will fill in all the rows, or truncate any
            // unused rows if a `HALT` operation occurs before `fragment_size` rows have been
            // written.
817            fragment: unsafe { CoreTraceFragment::new_uninit(fragment_size) },
818            context,
819            span_context: None,
820            stack_rows: None,
821            system_rows: None,
822            fragment_size,
823        }
824    }
825
    /// Processes this generator's fragment context into a `CoreTraceFragment`.
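    ///
    /// Returns the generated fragment together with the final stack and system rows, which the
    /// caller uses to seed the first row of the next fragment.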
827    pub fn generate_fragment(
828        mut self,
829    ) -> (CoreTraceFragment, [Felt; STACK_TRACE_WIDTH], [Felt; SYS_TRACE_WIDTH]) {
830        let execution_state = self.context.initial_execution_state.clone();
831        // Execute fragment generation and always finalize at the end
832        let _ = self.execute_fragment_generation(execution_state);
833        let final_stack_rows = self.stack_rows.unwrap_or([ZERO; STACK_TRACE_WIDTH]);
834        let final_system_rows = self.system_rows.unwrap_or([ZERO; SYS_TRACE_WIDTH]);
835        let fragment = self.finalize_fragment();
836        (fragment, final_stack_rows, final_system_rows)
837    }
838
    /// Internal method that performs fragment generation, returning early (via
    /// `ControlFlow::Break`) once the fragment is full.
840    fn execute_fragment_generation(
841        &mut self,
842        execution_state: NodeExecutionState,
843    ) -> ControlFlow<()> {
844        let initial_mast_forest = self.context.initial_mast_forest.clone();
845
        // A fragment may begin in the middle of a node (e.g. partway through a basic block, or
        // just before a node's END row), so first finish the node described by the initial
        // execution state before moving on to the continuation stack.
847        match execution_state {
848            NodeExecutionState::BasicBlock { node_id, batch_index, op_idx_in_batch } => {
849                let basic_block_node = {
850                    let mast_node =
851                        initial_mast_forest.get_node_by_id(node_id).expect("node should exist");
852                    mast_node.get_basic_block().expect("Expected a basic block node")
853                };
854
855                let op_batches = basic_block_node.op_batches();
856                assert!(
857                    batch_index < op_batches.len(),
858                    "Batch index out of bounds: {batch_index} >= {}",
859                    op_batches.len()
860                );
861
862                // Initialize the span context for the current basic block
863                self.span_context =
864                    Some(initialize_span_context(basic_block_node, batch_index, op_idx_in_batch));
865
866                // Execute remaining operations in the specified batch
867                {
868                    let current_batch = &op_batches[batch_index];
869                    self.execute_op_batch(
870                        current_batch,
871                        Some(op_idx_in_batch),
872                        &initial_mast_forest,
873                    )?;
874                }
875
876                // Execute remaining batches
877                for op_batch in op_batches.iter().skip(batch_index + 1) {
878                    self.respan(op_batch)?;
879
880                    self.execute_op_batch(op_batch, None, &initial_mast_forest)?;
881                }
882
883                // Add END trace row to complete the basic block
884                self.add_span_end_trace_row(basic_block_node)?;
885            },
886            NodeExecutionState::Start(node_id) => {
887                self.execute_mast_node(node_id, &initial_mast_forest)?;
888            },
889            NodeExecutionState::Respan { node_id, batch_index } => {
890                let basic_block_node = {
891                    let mast_node =
892                        initial_mast_forest.get_node_by_id(node_id).expect("node should exist");
893                    mast_node.get_basic_block().expect("Expected a basic block node")
894                };
895                let current_batch = &basic_block_node.op_batches()[batch_index];
896
897                self.span_context = {
898                    let current_op_group = current_batch.groups()[0];
899                    let num_groups_left: usize = basic_block_node
900                        .op_batches()
901                        .iter()
902                        .skip(batch_index)
903                        .map(|batch| batch.num_groups())
904                        .sum();
905
906                    Some(BasicBlockContext {
907                        group_ops_left: current_op_group,
908                        num_groups_left: Felt::new(num_groups_left as u64),
909                    })
910                };
911
912                // Execute remaining batches
913                for op_batch in basic_block_node.op_batches().iter().skip(batch_index) {
914                    self.respan(op_batch)?;
915
916                    self.execute_op_batch(op_batch, None, &initial_mast_forest)?;
917                }
918
919                // Add END trace row to complete the basic block
920                self.add_span_end_trace_row(basic_block_node)?;
921            },
922            NodeExecutionState::LoopRepeat(node_id) => {
923                // TODO(plafer): merge with the `Continuation::FinishLoop` case
924                let loop_node = initial_mast_forest
925                    .get_node_by_id(node_id)
926                    .expect("node should exist")
927                    .unwrap_loop();
928
929                let mut condition = self.get(0);
930                self.decrement_size(&mut NoopTracer);
931
932                while condition == ONE {
933                    self.add_loop_repeat_trace_row(
934                        loop_node,
935                        &initial_mast_forest,
936                        self.context.state.decoder.current_addr,
937                    )?;
938
939                    self.execute_mast_node(loop_node.body(), &initial_mast_forest)?;
940
941                    condition = self.get(0);
942                    self.decrement_size(&mut NoopTracer);
943                }
944
945                // 3. Add "end LOOP" row
946                //
947                // Note that we don't confirm that the condition is properly ZERO here, as
948                // the FastProcessor already ran that check.
949                self.add_end_trace_row(loop_node.digest())?;
950            },
            // TODO(plafer): there's a big overlap between `NodeExecutionState::End` and
            // `Continuation::Finish*`. We can probably reconcile them.
953            NodeExecutionState::End(node_id) => {
954                let mast_node =
955                    initial_mast_forest.get_node_by_id(node_id).expect("node should exist");
956
957                match mast_node {
958                    MastNode::Join(join_node) => {
959                        self.add_end_trace_row(join_node.digest())?;
960                    },
961                    MastNode::Split(split_node) => {
962                        self.add_end_trace_row(split_node.digest())?;
963                    },
964                    MastNode::Loop(loop_node) => {
965                        let (ended_node_addr, flags) = self.update_decoder_state_on_node_end();
966
967                        // If the loop was entered, we need to shift the stack to the left
968                        if flags.loop_entered() == ONE {
969                            self.decrement_size(&mut NoopTracer);
970                        }
971
972                        self.add_end_trace_row_impl(loop_node.digest(), flags, ended_node_addr)?;
973                    },
974                    MastNode::Call(call_node) => {
975                        let ctx_info = self.context.replay.block_stack.replay_execution_context();
976                        self.restore_context_from_replay(&ctx_info);
977                        self.add_end_trace_row(call_node.digest())?;
978                    },
979                    MastNode::Dyn(dyn_node) => {
980                        if dyn_node.is_dyncall() {
981                            let ctx_info =
982                                self.context.replay.block_stack.replay_execution_context();
983                            self.restore_context_from_replay(&ctx_info);
984                        }
985                        self.add_end_trace_row(dyn_node.digest())?;
986                    },
987                    MastNode::Block(basic_block_node) => {
988                        self.add_span_end_trace_row(basic_block_node)?;
989                    },
990                    MastNode::External(_external_node) => {
991                        // External nodes don't generate trace rows directly, and hence will never
992                        // show up in the END execution state.
993                        panic!("Unexpected external node in END execution state")
994                    },
995                }
996            },
997        }
998
        // Main execution loop: process continuations from the continuation stack until it is
        // empty or the fragment is full.
1000        let mut current_forest = self.context.initial_mast_forest.clone();
1001
1002        while let Some(continuation) = self.context.continuation.pop_continuation() {
1003            match continuation {
1004                Continuation::StartNode(node_id) => {
1005                    self.execute_mast_node(node_id, &current_forest)?;
1006                },
1007                Continuation::FinishJoin(node_id) => {
1008                    let mast_node =
1009                        current_forest.get_node_by_id(node_id).expect("node should exist");
1010                    self.add_end_trace_row(mast_node.digest())?;
1011                },
1012                Continuation::FinishSplit(node_id) => {
1013                    let mast_node =
1014                        current_forest.get_node_by_id(node_id).expect("node should exist");
1015                    self.add_end_trace_row(mast_node.digest())?;
1016                },
1017                Continuation::FinishLoop(node_id) => {
1018                    let loop_node = current_forest
1019                        .get_node_by_id(node_id)
1020                        .expect("node should exist")
1021                        .unwrap_loop();
1022
1023                    let mut condition = self.get(0);
1024                    self.decrement_size(&mut NoopTracer);
1025
1026                    while condition == ONE {
1027                        self.add_loop_repeat_trace_row(
1028                            loop_node,
1029                            &current_forest,
1030                            self.context.state.decoder.current_addr,
1031                        )?;
1032
1033                        self.execute_mast_node(loop_node.body(), &current_forest)?;
1034
1035                        condition = self.get(0);
1036                        self.decrement_size(&mut NoopTracer);
1037                    }
1038
1039                    // 3. Add "end LOOP" row
1040                    //
1041                    // Note that we don't confirm that the condition is properly ZERO here, as
1042                    // the FastProcessor already ran that check.
1043                    self.add_end_trace_row(loop_node.digest())?;
1044                },
1045                Continuation::FinishCall(node_id) => {
1046                    let mast_node =
1047                        current_forest.get_node_by_id(node_id).expect("node should exist");
1048
1049                    // Restore context
1050                    let ctx_info = self.context.replay.block_stack.replay_execution_context();
1051                    self.restore_context_from_replay(&ctx_info);
1052
1053                    // write END row to trace
1054                    self.add_end_trace_row(mast_node.digest())?;
1055                },
1056                Continuation::FinishDyn(node_id) => {
1057                    let dyn_node = current_forest
1058                        .get_node_by_id(node_id)
1059                        .expect("node should exist")
1060                        .unwrap_dyn();
1061                    if dyn_node.is_dyncall() {
1062                        let ctx_info = self.context.replay.block_stack.replay_execution_context();
1063                        self.restore_context_from_replay(&ctx_info);
1064                    }
1065
1066                    self.add_end_trace_row(dyn_node.digest())?;
1067                },
1068                Continuation::FinishExternal(_node_id) => {
                    // Nothing to do here: external nodes don't generate END trace rows in the
                    // parallel processor; they only execute after_exit decorators, which don't
                    // produce trace rows. Note: current_forest should already be restored by the
                    // EnterForest continuation.
1073                },
1074                Continuation::EnterForest(previous_forest) => {
1075                    // Restore the previous forest
1076                    current_forest = previous_forest;
1077                },
1078            }
1079        }
1080
1081        // All nodes completed without filling the fragment
1082        ControlFlow::Continue(())
1083    }
1084
1085    fn execute_mast_node(
1086        &mut self,
1087        node_id: MastNodeId,
1088        current_forest: &MastForest,
1089    ) -> ControlFlow<()> {
1090        let mast_node = current_forest.get_node_by_id(node_id).expect("node should exist");
1091
1092        match mast_node {
1093            MastNode::Block(basic_block_node) => {
1094                self.update_decoder_state_on_node_start();
1095
1096                let num_groups_left_in_block = Felt::from(basic_block_node.num_op_groups() as u32);
1097                let first_op_batch = basic_block_node
1098                    .op_batches()
1099                    .first()
1100                    .expect("Basic block should have at least one op batch");
1101
1102                // 1. Add SPAN start trace row
1103                self.add_span_start_trace_row(
1104                    first_op_batch,
1105                    num_groups_left_in_block,
1106                    // TODO(plafer): remove as parameter? (and same for all other start methods)
1107                    self.context.state.decoder.parent_addr,
1108                )?;
1109
1110                // Initialize the span context for the current basic block. After SPAN operation is
1111                // executed, we decrement the number of remaining groups by 1 because executing
1112                // SPAN consumes the first group of the batch.
1113                // TODO(plafer): use `initialize_span_context` once the potential off-by-one issue
1114                // is resolved.
1115                self.span_context = Some(BasicBlockContext {
1116                    group_ops_left: first_op_batch.groups()[0],
1117                    num_groups_left: num_groups_left_in_block - ONE,
1118                });
1119
1120                // 2. Execute batches one by one
1121                let op_batches = basic_block_node.op_batches();
1122
1123                // Execute first op batch
1124                {
1125                    let first_op_batch =
1126                        op_batches.first().expect("Basic block should have at least one op batch");
1127                    self.execute_op_batch(first_op_batch, None, current_forest)?;
1128                }
1129
1130                // Execute the rest of the op batches
1131                for op_batch in op_batches.iter().skip(1) {
1132                    // 3. Add RESPAN trace row between batches
1133                    self.respan(op_batch)?;
1134
1135                    self.execute_op_batch(op_batch, None, current_forest)?;
1136                }
1137
1138                // 4. Add END trace row
1139                self.add_span_end_trace_row(basic_block_node)?;
1140
1141                ControlFlow::Continue(())
1142            },
1143            MastNode::Join(join_node) => {
1144                self.update_decoder_state_on_node_start();
1145
1146                // 1. Add "start JOIN" row
1147                self.add_join_start_trace_row(
1148                    join_node,
1149                    current_forest,
1150                    self.context.state.decoder.parent_addr,
1151                )?;
1152
1153                // 2. Execute first child
1154                self.execute_mast_node(join_node.first(), current_forest)?;
1155
1156                // 3. Execute second child
1157                self.execute_mast_node(join_node.second(), current_forest)?;
1158
1159                // 4. Add "end JOIN" row
1160                self.add_end_trace_row(join_node.digest())
1161            },
1162            MastNode::Split(split_node) => {
1163                self.update_decoder_state_on_node_start();
1164
1165                let condition = self.get(0);
1166                self.decrement_size(&mut NoopTracer);
1167
1168                // 1. Add "start SPLIT" row
1169                self.add_split_start_trace_row(
1170                    split_node,
1171                    current_forest,
1172                    self.context.state.decoder.parent_addr,
1173                )?;
1174
1175                // 2. Execute the appropriate branch based on the stack top value
1176                if condition == ONE {
1177                    self.execute_mast_node(split_node.on_true(), current_forest)?;
1178                } else {
1179                    self.execute_mast_node(split_node.on_false(), current_forest)?;
1180                }
1181
1182                // 3. Add "end SPLIT" row
1183                self.add_end_trace_row(split_node.digest())
1184            },
1185            MastNode::Loop(loop_node) => {
1186                self.update_decoder_state_on_node_start();
1187
1188                // Read condition from the stack and decrement stack size. This happens as part of
1189                // the LOOP operation, and so is done before writing that trace row.
1190                let mut condition = self.get(0);
1191                self.decrement_size(&mut NoopTracer);
1192
1193                // 1. Add "start LOOP" row
1194                self.add_loop_start_trace_row(
1195                    loop_node,
1196                    current_forest,
1197                    self.context.state.decoder.parent_addr,
1198                )?;
1199
1200                // 2. Loop while condition is true
1201                //
1202                // The first iteration is special because it doesn't insert a REPEAT trace row
1203                // before executing the loop body. Therefore it is done separately.
1204                if condition == ONE {
1205                    self.execute_mast_node(loop_node.body(), current_forest)?;
1206
1207                    condition = self.get(0);
1208                    self.decrement_size(&mut NoopTracer);
1209                }
1210
1211                while condition == ONE {
1212                    self.add_loop_repeat_trace_row(
1213                        loop_node,
1214                        current_forest,
1215                        self.context.state.decoder.current_addr,
1216                    )?;
1217
1218                    self.execute_mast_node(loop_node.body(), current_forest)?;
1219
1220                    condition = self.get(0);
1221                    self.decrement_size(&mut NoopTracer);
1222                }
1223
1224                // 3. Add "end LOOP" row
1225                //
1226                // Note that we don't confirm that the condition is properly ZERO here, as the
1227                // FastProcessor already ran that check.
1228                self.add_end_trace_row(loop_node.digest())
1229            },
1230            MastNode::Call(call_node) => {
1231                self.update_decoder_state_on_node_start();
1232
1233                self.context.state.stack.start_context();
1234
1235                // Set up new context for the call
1236                if call_node.is_syscall() {
1237                    self.context.state.system.ctx = ContextId::root(); // Root context for syscalls
1238                } else {
1239                    self.context.state.system.ctx =
1240                        ContextId::from(self.context.state.system.clk + 1); // New context ID
1241                    self.context.state.system.fn_hash = current_forest[call_node.callee()].digest();
1242                }
1243
1244                // Add "start CALL/SYSCALL" row
1245                self.add_call_start_trace_row(
1246                    call_node,
1247                    current_forest,
1248                    self.context.state.decoder.parent_addr,
1249                )?;
1250
1251                // Execute the callee
1252                self.execute_mast_node(call_node.callee(), current_forest)?;
1253
1254                // Restore context state
1255                let ctx_info = self.context.replay.block_stack.replay_execution_context();
1256                self.restore_context_from_replay(&ctx_info);
1257
1258                // 2. Add "end CALL/SYSCALL" row
1259                self.add_end_trace_row(call_node.digest())
1260            },
1261            MastNode::Dyn(dyn_node) => {
1262                self.update_decoder_state_on_node_start();
1263
1264                let callee_hash = {
1265                    let mem_addr = self.context.state.stack.get(0);
1266                    self.context.replay.memory_reads.replay_read_word(mem_addr)
1267                };
1268
1269                // 1. Add "start DYN/DYNCALL" row
1270                if dyn_node.is_dyncall() {
1271                    let (stack_depth, next_overflow_addr) =
1272                        self.shift_stack_left_and_start_context();
1273                    // For DYNCALL, we need to save the current context state
1274                    // and prepare for dynamic execution
1275                    let ctx_info = ExecutionContextInfo::new(
1276                        self.context.state.system.ctx,
1277                        self.context.state.system.fn_hash,
1278                        stack_depth as u32,
1279                        next_overflow_addr,
1280                    );
1281
1282                    self.context.state.system.ctx =
1283                        ContextId::from(self.context.state.system.clk + 1); // New context ID
1284                    self.context.state.system.fn_hash = callee_hash;
1285
1286                    self.add_dyncall_start_trace_row(
1287                        self.context.state.decoder.parent_addr,
1288                        callee_hash,
1289                        ctx_info,
1290                    )?;
1291                } else {
1292                    // Pop the memory address off the stack, and write the DYN trace row
1293                    self.decrement_size(&mut NoopTracer);
1294                    self.add_dyn_start_trace_row(
1295                        self.context.state.decoder.parent_addr,
1296                        callee_hash,
1297                    )?;
1298                };
1299
1300                // 2. Execute the callee
1301                match current_forest.find_procedure_root(callee_hash) {
1302                    Some(callee_id) => self.execute_mast_node(callee_id, current_forest)?,
1303                    None => {
1304                        let (resolved_node_id, resolved_forest) =
1305                            self.context.replay.mast_forest_resolution.replay_resolution();
1306
1307                        self.execute_mast_node(resolved_node_id, &resolved_forest)?
1308                    },
1309                };
1310
1311                // Restore context state for DYNCALL
1312                if dyn_node.is_dyncall() {
1313                    let ctx_info = self.context.replay.block_stack.replay_execution_context();
1314                    self.restore_context_from_replay(&ctx_info);
1315                }
1316
1317                // 3. Add "end DYN/DYNCALL" row
1318                self.add_end_trace_row(dyn_node.digest())
1319            },
1320            MastNode::External(_) => {
1321                let (resolved_node_id, resolved_forest) =
1322                    self.context.replay.mast_forest_resolution.replay_resolution();
1323
1324                self.execute_mast_node(resolved_node_id, &resolved_forest)
1325            },
1326        }
1327    }
1328
1329    // TODO(plafer): clean this up; the relationship between the replays and the `DecoderState`
1330    // is not obvious from the `TraceFragmentContext` alone (it only becomes clear when reading
1331    // this function).
1332    /// This function is called when we start executing a node (e.g. `JOIN`, `SPLIT`, etc.). It
1333    /// emulates pushing a new node onto the block stack, and updates the decoder state to point to
1334    /// the current node in the block stack (which could be renamed to "node stack"). Hence,
1335    /// `current_addr` is set to the (replayed) address of the current node, and `parent_addr` is
1336    /// set to the (replayed) address of the parent node (i.e. the node that was previously on top
1337    /// of the block stack).
1338    fn update_decoder_state_on_node_start(&mut self) {
1339        let DecoderState { current_addr, parent_addr } = &mut self.context.state.decoder;
1340
1341        *current_addr = self.context.replay.hasher.replay_block_address();
1342        *parent_addr = self.context.replay.block_stack.replay_node_start();
1343    }
1344
1345    /// This function is called when we hit an `END` operation, signaling the end of execution for a
1346    /// node. It updates the decoder state to point to the previous node in the block stack (which
1347    /// could be renamed to "node stack"), and returns the address of the node that just ended,
1348    /// along with any flags associated with it.
1349    fn update_decoder_state_on_node_end(&mut self) -> (Felt, NodeFlags) {
1350        let DecoderState { current_addr, parent_addr } = &mut self.context.state.decoder;
1351
1352        let node_end_data = self.context.replay.block_stack.replay_node_end();
1353
1354        *current_addr = node_end_data.prev_addr;
1355        *parent_addr = node_end_data.prev_parent_addr;
1356
1357        (node_end_data.ended_node_addr, node_end_data.flags)
1358    }
1359
1360    /// Restores the execution context to the state it was in before the last `call`, `syscall` or
1361    /// `dyncall`.
1362    ///
1363    /// This includes restoring the overflow stack and the system parameters.
1364    fn restore_context_from_replay(&mut self, ctx_info: &ExecutionContextSystemInfo) {
1365        self.context.state.system.ctx = ctx_info.parent_ctx;
1366        self.context.state.system.fn_hash = ctx_info.parent_fn_hash;
1367
1368        self.context
1369            .state
1370            .stack
1371            .restore_context(&mut self.context.replay.stack_overflow);
1372    }
1373
1374    /// Executes operations within an operation batch, analogous to `FastProcessor::execute_op_batch`.
1375    ///
1376    /// If `start_op_idx` is provided, execution begins from that operation index within the batch.
1377    fn execute_op_batch(
1378        &mut self,
1379        batch: &OpBatch,
1380        start_op_idx: Option<usize>,
1381        current_forest: &MastForest,
1382    ) -> ControlFlow<()> {
1383        let start_op_idx = start_op_idx.unwrap_or(0);
1384        let end_indices = batch.end_indices();
1385
1386        // Execute operations in the batch starting from the correct static operation index
1387        for (op_idx_in_batch, (op_group_idx, op_idx_in_group, op)) in
1388            batch.iter_with_groups().enumerate().skip(start_op_idx)
1389        {
1390            {
1391                // `execute_sync_op` does not support executing `Emit`, so we call it only for
1392                // the other operations.
1393                let user_op_helpers = if let Operation::Emit = op {
1394                    None
1395                } else {
1396                    // Note that the `op_idx_in_block` is only used in case of error, so we set it
1397                    // to 0.
1398                    // TODO(plafer): remove op_idx_in_block from u32_sub's error?
1399                    self.execute_sync_op(
1400                        op,
1401                        0,
1402                        current_forest,
1403                        &mut NoopHost,
1404                        &(),
1405                        &mut NoopTracer,
1406                    )
1407                    // The assumption here is that the computation was done by the FastProcessor,
1408                    // and so all operations in the program are valid and can be executed
1409                    // successfully.
1410                    .expect("operation should execute successfully")
1411                };
1412
1413                // write the operation to the trace
1414                self.add_operation_trace_row(*op, op_idx_in_group, user_op_helpers)?;
1415            }
1416
1417            // if we executed all operations in a group and haven't reached the end of the batch
1418            // yet, set up the decoder for decoding the next operation group
1419            if op_idx_in_batch + 1 == end_indices[op_group_idx]
1420                && let Some(next_op_group_idx) = batch.next_op_group_index(op_group_idx)
1421            {
1422                self.start_op_group(batch.groups()[next_op_group_idx]);
1423            }
1424        }
1425
1426        ControlFlow::Continue(())
1427    }
1428
1429    /// Starts decoding a new operation group.
1430    pub fn start_op_group(&mut self, op_group: Felt) {
1431        let ctx = self.span_context.as_mut().expect("not in span");
1432
1433        // reset the current group value and decrement the number of remaining groups by ONE
1434        debug_assert_eq!(ZERO, ctx.group_ops_left, "not all ops executed in current group");
1435        ctx.group_ops_left = op_group;
1436        ctx.num_groups_left -= ONE;
1437    }
1438
1439    /// Finalizes and returns the built fragment, truncating any unused rows if necessary.
1440    fn finalize_fragment(mut self) -> CoreTraceFragment {
1441        debug_assert!(
1442            self.num_rows_built() <= self.fragment_size,
1443            "built too many rows: {} > {}",
1444            self.num_rows_built(),
1445            self.fragment_size
1446        );
1447
1448        // If we built fewer rows than `fragment_size`, truncate the columns down to the number of
1449        // rows actually built. This occurs only in the last fragment of a trace, when a HALT
1450        // operation is encountered before `fragment_size` rows have been generated.
1451        let num_rows = core::cmp::min(self.num_rows_built(), self.fragment_size);
1452        for column in &mut self.fragment.columns {
1453            column.truncate(num_rows);
1454        }
1455
1456        self.fragment
1457    }
1458
1459    // HELPERS
1460    // -------------------------------------------------------------------------------------------
1461
1462    // TODO(plafer): Should this be a `StackState` method?
1463    pub fn shift_stack_left_and_start_context(&mut self) -> (usize, Felt) {
1464        self.decrement_size(&mut NoopTracer);
1465        self.context.state.stack.start_context()
1466    }
1467
1468    fn append_opcode(&mut self, opcode: u8, row_idx: usize) {
1469        use miden_air::trace::{
1470            DECODER_TRACE_OFFSET,
1471            decoder::{NUM_OP_BITS, OP_BITS_OFFSET},
1472        };
1473
1474        // Append the opcode bits to the trace row
1475        let bits: [Felt; NUM_OP_BITS] = core::array::from_fn(|i| Felt::from((opcode >> i) & 1));
1476        for (i, bit) in bits.iter().enumerate() {
1477            self.fragment.columns[DECODER_TRACE_OFFSET + OP_BITS_OFFSET + i][row_idx] = *bit;
1478        }
1479
1480        // Set extra bit columns for degree reduction
1481        self.fragment.columns[DECODER_TRACE_OFFSET + OP_BITS_EXTRA_COLS_OFFSET][row_idx] =
1482            bits[6] * (ONE - bits[5]) * bits[4];
1483        self.fragment.columns[DECODER_TRACE_OFFSET + OP_BITS_EXTRA_COLS_OFFSET + 1][row_idx] =
1484            bits[6] * bits[5];
1485    }
1486
1487    fn done_generating(&mut self) -> bool {
1488        // If we have built all the rows in the fragment, we are done
1489        self.num_rows_built() >= self.fragment_size
1490    }
1491
1492    fn num_rows_built(&self) -> usize {
1493        // Returns the number of rows built so far in the fragment
1494        self.context.state.system.clk - self.fragment_start_clk
1495    }
1496
1497    fn increment_clk(&mut self) -> ControlFlow<()> {
1498        self.context.state.system.clk += 1u32;
1499
1500        // Check if we have reached the maximum number of rows in the fragment
1501        if self.done_generating() {
1502            // If we have reached the maximum, we are done generating
1503            ControlFlow::Break(())
1504        } else {
1505            // Otherwise, we continue generating
1506            ControlFlow::Continue(())
1507        }
1508    }
1509}
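
// The following is a minimal, self-contained sketch (not used by the generator) of the opcode
// layout written by `append_opcode` above: an opcode is decomposed into `NUM_OP_BITS` single-bit
// columns, least significant bit first, and two extra columns carry the products
// `b6 * (1 - b5) * b4` and `b6 * b5` so that constraint degrees can be reduced. The module name
// and the sample opcode are illustrative assumptions, not values taken from the VM.
#[cfg(test)]
mod opcode_bits_sketch {
    use miden_air::trace::decoder::NUM_OP_BITS;

    #[test]
    fn opcode_round_trips_through_its_bit_columns() {
        // arbitrary 7-bit value standing in for an opcode
        let opcode: u8 = 0b101_1010;

        // decompose into one column per bit, least significant bit first
        let bits: [u8; NUM_OP_BITS] = core::array::from_fn(|i| (opcode >> i) & 1);

        // the bit columns fully determine the opcode
        let reconstructed = bits.iter().enumerate().fold(0u8, |acc, (i, &bit)| acc | (bit << i));
        assert_eq!(reconstructed, opcode);

        // the extra "degree reduction" columns are fixed products of the top three bits
        let e0 = bits[6] * (1 - bits[5]) * bits[4];
        let e1 = bits[6] * bits[5];
        assert!(e0 <= 1 && e1 <= 1);
    }
}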
1510
1511// HELPERS
1512// ===============================================================================================
1513
1514/// Given that a trace fragment can start executing from the middle of a basic block, we need to
1515/// initialize the `BasicBlockContext` correctly to reflect the state of the decoder at that point.
1516/// This function does that initialization.
1517///
1518/// Recall that `BasicBlockContext` keeps track of the state needed to correctly fill in the decoder
1519/// columns associated with a SPAN of operations (i.e. a basic block). This function takes in a
1520/// basic block node, the index of the current operation batch within that block, and the index of
1521/// the current operation within that batch, and initializes the `BasicBlockContext` accordingly. In
1522/// other words, it figures out how many operations are left in the current operation group, and how
1523/// many operation groups are left in the basic block, given that we are starting execution from the
1524/// specified operation.
1525fn initialize_span_context(
1526    basic_block_node: &BasicBlockNode,
1527    batch_index: usize,
1528    op_idx_in_batch: usize,
1529) -> BasicBlockContext {
1530    let op_batches = basic_block_node.op_batches();
1531    let (current_op_group_idx, op_idx_in_group) = op_batches[batch_index]
1532        .op_idx_in_batch_to_group(op_idx_in_batch)
1533        .expect("invalid batch");
1534
1535    let group_ops_left = {
1536        // Note: this here relies on NOOP's opcode to be 0, since `current_op_group_idx` could point
1537        // to an op group that contains a NOOP inserted at runtime (i.e. padding at the end of the
1538        // batch), and hence not encoded in the basic block directly. But since NOOP's opcode is 0,
1539        // this works out correctly (since empty groups are also represented by 0).
1540        let current_op_group = op_batches[batch_index].groups()[current_op_group_idx];
1541
1542        // Shift out all operations that have already been executed in this group.
1543        //
1544        // Note: `group_ops_left` encodes the bits of the operations left to be executed after the
1545        // current one, so we would expect to shift by `NUM_OP_BITS * (op_idx_in_group + 1)`.
1546        // However, the shift for the current operation is applied right before writing to the
1547        // trace, so we only shift by `NUM_OP_BITS * op_idx_in_group` here.
1548        Felt::new(current_op_group.as_int() >> (NUM_OP_BITS * op_idx_in_group))
1549    };
1550
1551    let num_groups_left = {
1552        // TODO(plafer): do we have to look at all op groups in the block? Can't we just look at the
1553        // current batch's groups?
1554        let total_groups = basic_block_node.num_op_groups();
1555
1556        // Count the groups consumed by completed batches (all batches before the current one).
1557        let mut groups_consumed = 0;
1558        for op_batch in op_batches.iter().take(batch_index) {
1559            groups_consumed += op_batch.num_groups().next_power_of_two();
1560        }
1561
1562        // Walk over the operations already executed in the current op group; each one that carries
1563        // an immediate value consumes an extra group (the immediate is stored in its own group).
1564        {
1565            // Note: This is a hacky way of doing this because `OpBatch` doesn't store the
1566            // information of which operation belongs to which group.
1567            let mut current_op_group =
1568                op_batches[batch_index].groups()[current_op_group_idx].as_int();
1569            for _ in 0..op_idx_in_group {
1570                let current_op = (current_op_group & 0b1111111) as u8;
1571                if current_op == OPCODE_PUSH {
1572                    groups_consumed += 1;
1573                }
1574
1575                current_op_group >>= NUM_OP_BITS; // Shift to the next operation in the group
1576            }
1577        }
1578
1579        // Add the number of complete groups before the current group in this batch. Add 1 to
1580        // account for the current group (since `num_groups_left` is the number of groups left
1581        // *after* being done with the current group)
1582        groups_consumed += current_op_group_idx + 1;
1583
1584        Felt::from((total_groups - groups_consumed) as u32)
1585    };
1586
1587    BasicBlockContext { group_ops_left, num_groups_left }
1588}
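
// The following is a minimal, self-contained sketch (not used above) of the bit arithmetic behind
// `group_ops_left`: an op group packs its operations `NUM_OP_BITS` bits each, starting from the
// least significant bits, so shifting the group right by `NUM_OP_BITS * k` discards the first `k`
// operations and leaves the next operation in the low bits. The packed values are arbitrary 7-bit
// placeholders, not real Miden opcodes.
#[cfg(test)]
mod span_context_sketch {
    use miden_air::trace::decoder::NUM_OP_BITS;

    #[test]
    fn shifting_a_group_discards_executed_operations() {
        // placeholder op codes, one per operation in the group
        let ops: [u64; 3] = [0x25, 0x13, 0x07];
        let op_mask: u64 = (1 << NUM_OP_BITS) - 1;

        // pack the operations into a single group, first operation in the low bits
        let group: u64 = ops.iter().rev().fold(0, |acc, &op| (acc << NUM_OP_BITS) | op);

        // after executing the first operation, the remaining group starts with the second one
        assert_eq!((group >> NUM_OP_BITS) & op_mask, ops[1]);

        // after executing two operations, only the third one is left
        assert_eq!(group >> (NUM_OP_BITS * 2), ops[2]);
    }
}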
1589
1590// REQUIRED METHODS
1591// ===============================================================================================
1592
1593impl StackInterface for CoreTraceFragmentGenerator {
1594    fn top(&self) -> &[Felt] {
1595        &self.context.state.stack.stack_top
1596    }
1597
1598    fn top_mut(&mut self) -> &mut [Felt] {
1599        &mut self.context.state.stack.stack_top
1600    }
1601
1602    fn get(&self, idx: usize) -> Felt {
1603        debug_assert!(idx < MIN_STACK_DEPTH);
1604        self.context.state.stack.stack_top[MIN_STACK_DEPTH - idx - 1]
1605    }
1606
1607    fn get_mut(&mut self, idx: usize) -> &mut Felt {
1608        debug_assert!(idx < MIN_STACK_DEPTH);
1609
1610        &mut self.context.state.stack.stack_top[MIN_STACK_DEPTH - idx - 1]
1611    }
1612
1613    fn get_word(&self, start_idx: usize) -> Word {
1614        debug_assert!(start_idx < MIN_STACK_DEPTH - 4);
1615
1616        let word_start_idx = MIN_STACK_DEPTH - start_idx - 4;
1617        self.top()[range(word_start_idx, WORD_SIZE)].try_into().unwrap()
1618    }
1619
1620    fn depth(&self) -> u32 {
1621        (MIN_STACK_DEPTH + self.context.state.stack.num_overflow_elements_in_current_ctx()) as u32
1622    }
1623
1624    fn set(&mut self, idx: usize, element: Felt) {
1625        *self.get_mut(idx) = element;
1626    }
1627
1628    fn set_word(&mut self, start_idx: usize, word: &Word) {
1629        debug_assert!(start_idx < MIN_STACK_DEPTH - 4);
1630        let word_start_idx = MIN_STACK_DEPTH - start_idx - 4;
1631
1632        let word_on_stack =
1633            &mut self.context.state.stack.stack_top[range(word_start_idx, WORD_SIZE)];
1634        word_on_stack.copy_from_slice(word.as_slice());
1635    }
1636
1637    fn swap(&mut self, idx1: usize, idx2: usize) {
1638        let a = self.get(idx1);
1639        let b = self.get(idx2);
1640        self.set(idx1, b);
1641        self.set(idx2, a);
1642    }
1643
1644    fn swapw_nth(&mut self, n: usize) {
1645        // For example, for n=3, the stack words and variables look like:
1646        //    3     2     1     0
1647        // | ... | ... | ... | ... |
1648        // ^                 ^
1649        // nth_word       top_word
1650        let (rest_of_stack, top_word) =
1651            self.context.state.stack.stack_top.split_at_mut(MIN_STACK_DEPTH - WORD_SIZE);
1652        let (_, nth_word) = rest_of_stack.split_at_mut(rest_of_stack.len() - n * WORD_SIZE);
1653
1654        nth_word[0..WORD_SIZE].swap_with_slice(&mut top_word[0..WORD_SIZE]);
1655    }
1656
1657    fn rotate_left(&mut self, n: usize) {
1658        let rotation_bot_index = MIN_STACK_DEPTH - n;
1659        let new_stack_top_element = self.context.state.stack.stack_top[rotation_bot_index];
1660
1661        // shift the top n elements down by 1, starting from the bottom of the rotation.
1662        for i in 0..n - 1 {
1663            self.context.state.stack.stack_top[rotation_bot_index + i] =
1664                self.context.state.stack.stack_top[rotation_bot_index + i + 1];
1665        }
1666
1667        // Set the top element (which comes from the bottom of the rotation).
1668        self.set(0, new_stack_top_element);
1669    }
1670
1671    fn rotate_right(&mut self, n: usize) {
1672        let rotation_bot_index = MIN_STACK_DEPTH - n;
1673        let new_stack_bot_element = self.context.state.stack.stack_top[MIN_STACK_DEPTH - 1];
1674
1675        // shift the top n elements up by 1, starting from the top of the rotation.
1676        for i in 1..n {
1677            self.context.state.stack.stack_top[MIN_STACK_DEPTH - i] =
1678                self.context.state.stack.stack_top[MIN_STACK_DEPTH - i - 1];
1679        }
1680
1681        // Set the bot element (which comes from the top of the rotation).
1682        self.context.state.stack.stack_top[rotation_bot_index] = new_stack_bot_element;
1683    }
1684
1685    fn increment_size(&mut self, _tracer: &mut impl Tracer) -> Result<(), ExecutionError> {
1686        const SENTINEL_VALUE: Felt = Felt::new(Felt::MODULUS - 1);
1687
1688        // push the last element on the overflow table
1689        {
1690            let last_element = self.get(MIN_STACK_DEPTH - 1);
1691            self.context.state.stack.push_overflow(last_element, self.clk());
1692        }
1693
1694        // Shift all other elements down
1695        for write_idx in (1..MIN_STACK_DEPTH).rev() {
1696            let read_idx = write_idx - 1;
1697            self.set(write_idx, self.get(read_idx));
1698        }
1699
1700        // Set the top element to SENTINEL_VALUE to help with debugging. Per the method docs, this
1701        // value will be overwritten.
1702        self.set(0, SENTINEL_VALUE);
1703
1704        Ok(())
1705    }
1706
1707    fn decrement_size(&mut self, _tracer: &mut impl Tracer) {
1708        // Shift all other elements up
1709        for write_idx in 0..(MIN_STACK_DEPTH - 1) {
1710            let read_idx = write_idx + 1;
1711            self.set(write_idx, self.get(read_idx));
1712        }
1713
1714        // Pop the last element from the overflow table
1715        if let Some(last_element) =
1716            self.context.state.stack.pop_overflow(&mut self.context.replay.stack_overflow)
1717        {
1718            // Write the last element to the bottom of the stack
1719            self.set(MIN_STACK_DEPTH - 1, last_element);
1720        } else {
1721            // If overflow table is empty, set the bottom element to zero
1722            self.set(MIN_STACK_DEPTH - 1, ZERO);
1723        }
1724    }
1725}
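
// The following is a minimal, self-contained sketch (not used above) of the rotation semantics
// implemented by `rotate_left` / `rotate_right`, expressed in stack order (position 0 = top):
// `rotate_left(n)` moves the element at position `n - 1` to the top while the elements above it
// shift down by one position, and `rotate_right(n)` is its inverse. The fixed-size array is an
// illustrative stand-in for the stack top, not the generator's actual representation.
#[cfg(test)]
mod rotation_semantics_sketch {
    #[test]
    fn rotate_left_and_right_are_inverses_on_the_top_n_elements() {
        // positions 0..=6, where position 0 is the top of the stack
        let mut stack = [0u8, 1, 2, 3, 4, 5, 6];

        // `rotate_left(4)`: position 3 becomes the new top, positions 0..=2 shift down by one
        stack[..4].rotate_right(1);
        assert_eq!(stack, [3, 0, 1, 2, 4, 5, 6]);

        // `rotate_right(4)`: the top goes back to position 3, undoing the previous rotation
        stack[..4].rotate_left(1);
        assert_eq!(stack, [0, 1, 2, 3, 4, 5, 6]);
    }
}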
1726
1727impl Processor for CoreTraceFragmentGenerator {
1728    type HelperRegisters = TraceGenerationHelpers;
1729    type System = Self;
1730    type Stack = Self;
1731    type AdviceProvider = AdviceReplay;
1732    type Memory = MemoryReadsReplay;
1733    type Hasher = HasherResponseReplay;
1734
1735    fn stack(&mut self) -> &mut Self::Stack {
1736        self
1737    }
1738
1739    fn system(&mut self) -> &mut Self::System {
1740        self
1741    }
1742
1743    fn state(&mut self) -> ProcessState<'_> {
1744        ProcessState::Noop(())
1745    }
1746
1747    fn advice_provider(&mut self) -> &mut Self::AdviceProvider {
1748        &mut self.context.replay.advice
1749    }
1750
1751    fn memory(&mut self) -> &mut Self::Memory {
1752        &mut self.context.replay.memory_reads
1753    }
1754
1755    fn hasher(&mut self) -> &mut Self::Hasher {
1756        &mut self.context.replay.hasher
1757    }
1758
1759    fn precompile_transcript_state(&self) -> PrecompileTranscriptState {
1760        self.context.state.system.pc_transcript_state
1761    }
1762
1763    fn set_precompile_transcript_state(&mut self, state: PrecompileTranscriptState) {
1764        self.context.state.system.pc_transcript_state = state;
1765    }
1766
1767    fn op_eval_circuit(
1768        &mut self,
1769        err_ctx: &impl ErrorContext,
1770        tracer: &mut impl Tracer,
1771    ) -> Result<(), ExecutionError> {
1772        let num_eval = self.stack().get(2);
1773        let num_read = self.stack().get(1);
1774        let ptr = self.stack().get(0);
1775        let ctx = self.system().ctx();
1776
1777        let _circuit_evaluation = eval_circuit_fast_(
1778            ctx,
1779            ptr,
1780            self.system().clk(),
1781            num_read,
1782            num_eval,
1783            self,
1784            err_ctx,
1785            tracer,
1786        )?;
1787
1788        Ok(())
1789    }
1790}
1791
1792impl SystemInterface for CoreTraceFragmentGenerator {
1793    fn caller_hash(&self) -> Word {
1794        self.context.state.system.fn_hash
1795    }
1796
1797    fn clk(&self) -> RowIndex {
1798        self.context.state.system.clk
1799    }
1800
1801    fn ctx(&self) -> ContextId {
1802        self.context.state.system.ctx
1803    }
1804}
1805
1806/// Implementation of `OperationHelperRegisters` used for trace generation, where we actually
1807/// compute the helper registers associated with the corresponding operation.
1808struct TraceGenerationHelpers;
1809
1810impl OperationHelperRegisters for TraceGenerationHelpers {
1811    #[inline(always)]
1812    fn op_eq_registers(stack_second: Felt, stack_first: Felt) -> [Felt; NUM_USER_OP_HELPERS] {
1813        let h0 = if stack_second == stack_first {
1814            ZERO
1815        } else {
1816            (stack_first - stack_second).inv()
1817        };
1818
1819        [h0, ZERO, ZERO, ZERO, ZERO, ZERO]
1820    }
1821
1822    #[inline(always)]
1823    fn op_u32split_registers(hi: Felt, lo: Felt) -> [Felt; NUM_USER_OP_HELPERS] {
1824        let (t1, t0) = split_u32_into_u16(lo.as_int());
1825        let (t3, t2) = split_u32_into_u16(hi.as_int());
1826        let m = (Felt::from(u32::MAX) - hi).inv();
1827
1828        [Felt::from(t0), Felt::from(t1), Felt::from(t2), Felt::from(t3), m, ZERO]
1829    }
1830
1831    #[inline(always)]
1832    fn op_eqz_registers(top: Felt) -> [Felt; NUM_USER_OP_HELPERS] {
1833        // h0 is a helper variable provided by the prover. If the top element is zero, h0 can be
1834        // set to anything; otherwise, it is set to the inverse of the top element of the stack.
1835        let h0 = if top == ZERO { ZERO } else { top.inv() };
1836
1837        [h0, ZERO, ZERO, ZERO, ZERO, ZERO]
1838    }
1839
1840    #[inline(always)]
1841    fn op_expacc_registers(acc_update_val: Felt) -> [Felt; NUM_USER_OP_HELPERS] {
1842        [acc_update_val, ZERO, ZERO, ZERO, ZERO, ZERO]
1843    }
1844
1845    #[inline(always)]
1846    fn op_fri_ext2fold4_registers(
1847        ev: QuadFelt,
1848        es: QuadFelt,
1849        x: Felt,
1850        x_inv: Felt,
1851    ) -> [Felt; NUM_USER_OP_HELPERS] {
1852        let ev_arr = [ev];
1853        let ev_felts = QuadFelt::slice_as_base_elements(&ev_arr);
1854
1855        let es_arr = [es];
1856        let es_felts = QuadFelt::slice_as_base_elements(&es_arr);
1857
1858        [ev_felts[0], ev_felts[1], es_felts[0], es_felts[1], x, x_inv]
1859    }
1860
1861    #[inline(always)]
1862    fn op_u32add_registers(hi: Felt, lo: Felt) -> [Felt; NUM_USER_OP_HELPERS] {
1863        // Compute helpers for range checks
1864        let (t1, t0) = split_u32_into_u16(lo.as_int());
1865        let (t3, t2) = split_u32_into_u16(hi.as_int());
1866
1867        // For u32add, check_element_validity is false
1868        [Felt::from(t0), Felt::from(t1), Felt::from(t2), Felt::from(t3), ZERO, ZERO]
1869    }
1870
1871    #[inline(always)]
1872    fn op_u32add3_registers(hi: Felt, lo: Felt) -> [Felt; NUM_USER_OP_HELPERS] {
1873        // Compute helpers for range checks
1874        let (t1, t0) = split_u32_into_u16(lo.as_int());
1875        let (t3, t2) = split_u32_into_u16(hi.as_int());
1876
1877        [Felt::from(t0), Felt::from(t1), Felt::from(t2), Felt::from(t3), ZERO, ZERO]
1878    }
1879
1880    #[inline(always)]
1881    fn op_u32sub_registers(second_new: Felt) -> [Felt; NUM_USER_OP_HELPERS] {
1882        // Compute helpers for range checks (only `second_new` needs range checking)
1883        let (t1, t0) = split_u32_into_u16(second_new.as_int());
1884
1885        [Felt::from(t0), Felt::from(t1), ZERO, ZERO, ZERO, ZERO]
1886    }
1887
1888    #[inline(always)]
1889    fn op_u32mul_registers(hi: Felt, lo: Felt) -> [Felt; NUM_USER_OP_HELPERS] {
1890        // Compute helpers for range checks
1891        let (t1, t0) = split_u32_into_u16(lo.as_int());
1892        let (t3, t2) = split_u32_into_u16(hi.as_int());
1893        let m = (Felt::from(u32::MAX) - hi).inv();
1894
1895        [Felt::from(t0), Felt::from(t1), Felt::from(t2), Felt::from(t3), m, ZERO]
1896    }
1897
1898    #[inline(always)]
1899    fn op_u32madd_registers(hi: Felt, lo: Felt) -> [Felt; NUM_USER_OP_HELPERS] {
1900        // Compute helpers for range checks
1901        let (t1, t0) = split_u32_into_u16(lo.as_int());
1902        let (t3, t2) = split_u32_into_u16(hi.as_int());
1903        let m = (Felt::from(u32::MAX) - hi).inv();
1904
1905        [Felt::from(t0), Felt::from(t1), Felt::from(t2), Felt::from(t3), m, ZERO]
1906    }
1907
1908    #[inline(always)]
1909    fn op_u32div_registers(hi: Felt, lo: Felt) -> [Felt; NUM_USER_OP_HELPERS] {
1910        // Compute helpers for range checks
1911        let (t1, t0) = split_u32_into_u16(lo.as_int());
1912        let (t3, t2) = split_u32_into_u16(hi.as_int());
1913
1914        [Felt::from(t0), Felt::from(t1), Felt::from(t2), Felt::from(t3), ZERO, ZERO]
1915    }
1916
1917    #[inline(always)]
1918    fn op_u32assert2_registers(first: Felt, second: Felt) -> [Felt; NUM_USER_OP_HELPERS] {
1919        // Compute helpers for range checks for both operands
1920        let (t1, t0) = split_u32_into_u16(second.as_int());
1921        let (t3, t2) = split_u32_into_u16(first.as_int());
1922
1923        [Felt::from(t0), Felt::from(t1), Felt::from(t2), Felt::from(t3), ZERO, ZERO]
1924    }
1925
1926    #[inline(always)]
1927    fn op_hperm_registers(addr: Felt) -> [Felt; NUM_USER_OP_HELPERS] {
1928        [addr, ZERO, ZERO, ZERO, ZERO, ZERO]
1929    }
1930
1931    #[inline(always)]
1932    fn op_merkle_path_registers(addr: Felt) -> [Felt; NUM_USER_OP_HELPERS] {
1933        [addr, ZERO, ZERO, ZERO, ZERO, ZERO]
1934    }
1935
1936    #[inline(always)]
1937    fn op_horner_eval_registers(
1938        alpha: QuadFelt,
1939        k0: Felt,
1940        k1: Felt,
1941        acc_tmp: QuadFelt,
1942    ) -> [Felt; NUM_USER_OP_HELPERS] {
1943        [
1944            alpha.base_element(0),
1945            alpha.base_element(1),
1946            k0,
1947            k1,
1948            acc_tmp.base_element(0),
1949            acc_tmp.base_element(1),
1950        ]
1951    }
1952
1953    #[inline(always)]
1954    fn op_log_precompile_registers(addr: Felt, cap_prev: Word) -> [Felt; NUM_USER_OP_HELPERS] {
1955        // Helper registers layout for log_precompile:
1956        // h0-h4 contain: [addr, CAP_PREV[0..3]]
1957        [addr, cap_prev[0], cap_prev[1], cap_prev[2], cap_prev[3], ZERO]
1958    }
1959}
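
// The following is a minimal, self-contained sketch (not used above) of two patterns that recur in
// the helper registers: a 32-bit value is split into 16-bit limbs `(t1, t0)` destined for range
// checks, and equality-style operations expose an inverse `h0` such that `h0 * (a - b) = 1`
// whenever `a != b`. The concrete values are arbitrary examples.
#[cfg(test)]
mod helper_register_sketch {
    use miden_air::FieldElement;
    use miden_core::{Felt, ONE};

    #[test]
    fn u16_limbs_reassemble_into_the_original_u32() {
        let lo: u32 = 0xDEAD_BEEF;
        let (t1, t0) = ((lo >> 16) as u16, lo as u16);
        assert_eq!(((t1 as u32) << 16) | t0 as u32, lo);
    }

    #[test]
    fn eq_helper_is_the_inverse_of_the_difference() {
        let (a, b) = (Felt::new(7), Felt::new(3));
        let h0 = (a - b).inv();
        assert_eq!(h0 * (a - b), ONE);
    }
}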
1960
1961// HELPERS
1962// ================================================================================================
1963
1964// TODO(plafer): If we want to keep this strategy, then move the `op_eval_circuit()` method
1965// implementation to the `Processor` trait, and have `FastProcessor` and
1966// `CoreTraceFragmentGenerator` both use it.
1967/// Identical to [`chiplets::ace::eval_circuit`] but adapted for use with [`FastProcessor`].
1968#[allow(clippy::too_many_arguments)]
1969fn eval_circuit_fast_(
1970    ctx: ContextId,
1971    ptr: Felt,
1972    clk: RowIndex,
1973    num_vars: Felt,
1974    num_eval: Felt,
1975    processor: &mut CoreTraceFragmentGenerator,
1976    err_ctx: &impl ErrorContext,
1977    tracer: &mut impl Tracer,
1978) -> Result<CircuitEvaluation, ExecutionError> {
1979    let num_vars = num_vars.as_int();
1980    let num_eval = num_eval.as_int();
1981
1982    let num_wires = num_vars + num_eval;
1983    if num_wires > MAX_NUM_ACE_WIRES as u64 {
1984        return Err(ExecutionError::failed_arithmetic_evaluation(
1985            err_ctx,
1986            AceError::TooManyWires(num_wires),
1987        ));
1988    }
1989
1990    // Ensure vars and instructions are word-aligned and non-empty. Note that variables are
1991    // quadratic extension field elements while instructions are encoded as base field elements.
1992    // Hence we can pack 2 variables and 4 instructions per word.
1993    if !num_vars.is_multiple_of(2) || num_vars == 0 {
1994        return Err(ExecutionError::failed_arithmetic_evaluation(
1995            err_ctx,
1996            AceError::NumVarIsNotWordAlignedOrIsEmpty(num_vars),
1997        ));
1998    }
1999    if !num_eval.is_multiple_of(4) || num_eval == 0 {
2000        return Err(ExecutionError::failed_arithmetic_evaluation(
2001            err_ctx,
2002            AceError::NumEvalIsNotWordAlignedOrIsEmpty(num_eval),
2003        ));
2004    }
2005
2006    // Compute the number of READ rows (2 variables per word) and EVAL rows (one per instruction)
2007    let num_read_rows = num_vars as u32 / 2;
2008    let num_eval_rows = num_eval as u32;
2009
2010    let mut evaluation_context = CircuitEvaluation::new(ctx, clk, num_read_rows, num_eval_rows);
2011
2012    let mut ptr = ptr;
2013    // perform READ operations
2014    // Note: we pass in a `NoopTracer`, because the parallel trace generation skips the circuit
2015    // evaluation completely
2016    for _ in 0..num_read_rows {
2017        let word = processor
2018            .memory()
2019            .read_word(ctx, ptr, clk, err_ctx)
2020            .map_err(ExecutionError::MemoryError)?;
2021        tracer.record_memory_read_word(word, ptr, ctx, clk);
2022        evaluation_context.do_read(ptr, word)?;
2023        ptr += PTR_OFFSET_WORD;
2024    }
2025    // perform EVAL operations
2026    for _ in 0..num_eval_rows {
2027        let instruction = processor
2028            .memory()
2029            .read_element(ctx, ptr, err_ctx)
2030            .map_err(ExecutionError::MemoryError)?;
2031        tracer.record_memory_read_element(instruction, ptr, ctx, clk);
2032        evaluation_context.do_eval(ptr, instruction, err_ctx)?;
2033        ptr += PTR_OFFSET_ELEM;
2034    }
2035
2036    // Ensure the circuit evaluated to zero.
2037    if !evaluation_context.output_value().is_some_and(|eval| eval == QuadFelt::ZERO) {
2038        return Err(ExecutionError::failed_arithmetic_evaluation(
2039            err_ctx,
2040            AceError::CircuitNotEvaluateZero,
2041        ));
2042    }
2043
2044    Ok(evaluation_context)
2045}
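
// The following is a minimal, self-contained sketch (not used above) of the alignment arithmetic
// enforced by `eval_circuit_fast_`: variables are quadratic extension elements (2 per word) and
// instructions are base field elements (4 per word), so a well-formed circuit has an even variable
// count and an instruction count divisible by 4, yielding `num_vars / 2` READ rows and `num_eval`
// EVAL rows. The counts are arbitrary examples.
#[cfg(test)]
mod ace_alignment_sketch {
    #[test]
    fn word_aligned_counts_translate_into_row_counts() {
        let (num_vars, num_eval): (u64, u64) = (6, 8);

        // both counts must be word-aligned and non-empty
        assert!(num_vars % 2 == 0 && num_vars > 0);
        assert!(num_eval % 4 == 0 && num_eval > 0);

        // 6 variables fill 3 words -> 3 READ rows; each of the 8 instructions gets its own EVAL row
        assert_eq!(num_vars / 2, 3);
        assert_eq!(num_eval, 8);
    }
}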