miden_processor/trace/parallel/mod.rs

use alloc::{boxed::Box, sync::Arc, vec::Vec};

use itertools::Itertools;
use miden_air::{
    Felt,
    trace::{
        CLK_COL_IDX, CTX_COL_IDX, DECODER_TRACE_OFFSET, DECODER_TRACE_WIDTH, FN_HASH_RANGE,
        MIN_TRACE_LEN, MainTrace, PADDED_TRACE_WIDTH, RowIndex, STACK_TRACE_OFFSET,
        STACK_TRACE_WIDTH, SYS_TRACE_WIDTH, TRACE_WIDTH,
        decoder::{
            ADDR_COL_IDX, GROUP_COUNT_COL_IDX, HASHER_STATE_OFFSET, IN_SPAN_COL_IDX,
            NUM_HASHER_COLUMNS, NUM_OP_BATCH_FLAGS, NUM_OP_BITS, OP_BATCH_FLAGS_OFFSET,
            OP_BITS_EXTRA_COLS_OFFSET, OP_BITS_OFFSET, OP_INDEX_COL_IDX,
        },
        stack::{B0_COL_IDX, B1_COL_IDX, H0_COL_IDX, STACK_TOP_OFFSET},
    },
};
use miden_core::{
    ONE, Word, ZERO,
    field::batch_inversion_allow_zeros,
    mast::{MastForest, MastNode},
    operations::opcodes,
    program::{Kernel, MIN_STACK_DEPTH, ProgramInfo},
    utils::ColMatrix,
};
use rayon::prelude::*;
use tracing::instrument;

use crate::{
    ContextId, ExecutionError,
    continuation_stack::ContinuationStack,
    errors::MapExecErrNoCtx,
    fast::ExecutionOutput,
    trace::{
        AuxTraceBuilders, ChipletsLengths, ExecutionTrace, TraceLenSummary,
        parallel::{processor::ReplayProcessor, tracer::CoreTraceGenerationTracer},
        range::RangeChecker,
    },
};

pub const CORE_TRACE_WIDTH: usize = SYS_TRACE_WIDTH + DECODER_TRACE_WIDTH + STACK_TRACE_WIDTH;

/// Maximum number of trace rows; `build_trace()` uses this as a hard cap.
///
/// The cap is checked against `core_trace_contexts.len() * fragment_size` before any allocation,
/// and again while replaying chiplet activity, which keeps memory use bounded.
const MAX_TRACE_LEN: usize = 1 << 29;
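
// Illustrative arithmetic (hypothetical numbers, not fixed by this module): with fragments of
// 2^16 rows each, the 2^29-row cap admits at most 2^13 fragments before `build_trace()` rejects
// the execution with `TraceLenExceeded`.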

pub(crate) mod core_trace_fragment;
use core_trace_fragment::CoreTraceFragment;

mod processor;
mod tracer;

use super::{
    chiplets::Chiplets,
    decoder::AuxTraceBuilder as DecoderAuxTraceBuilder,
    execution_tracer::TraceGenerationContext,
    stack::AuxTraceBuilder as StackAuxTraceBuilder,
    trace_state::{
        AceReplay, BitwiseOp, BitwiseReplay, CoreTraceFragmentContext, CoreTraceState,
        ExecutionReplay, HasherOp, HasherRequestReplay, KernelReplay, MemoryWritesReplay,
        RangeCheckerReplay,
    },
};

#[cfg(test)]
mod tests;

// BUILD TRACE
// ================================================================================================

/// Builds the main trace from the provided trace states in parallel.
#[instrument(name = "build_trace", skip_all)]
pub fn build_trace(
    execution_output: ExecutionOutput,
    trace_generation_context: TraceGenerationContext,
    program_info: ProgramInfo,
) -> Result<ExecutionTrace, ExecutionError> {
    build_trace_with_max_len(
        execution_output,
        trace_generation_context,
        program_info,
        MAX_TRACE_LEN,
    )
}

/// Same as [`build_trace`], but with a custom hard cap.
///
/// Returns [`ExecutionError::TraceLenExceeded`] when the trace would exceed `max_trace_len`.
pub fn build_trace_with_max_len(
    execution_output: ExecutionOutput,
    trace_generation_context: TraceGenerationContext,
    program_info: ProgramInfo,
    max_trace_len: usize,
) -> Result<ExecutionTrace, ExecutionError> {
    let TraceGenerationContext {
        core_trace_contexts,
        range_checker_replay,
        memory_writes,
        bitwise_replay: bitwise,
        kernel_replay,
        hasher_for_chiplet,
        ace_replay,
        fragment_size,
    } = trace_generation_context;

    // Before any trace generation, check that the number of core trace rows doesn't exceed the
    // maximum trace length. This is a necessary check to avoid OOM panics during trace generation,
    // which can occur if the execution produces an extremely large number of steps.
    //
    // Note that we add 1 to the total core trace rows to account for the additional HALT opcode row
    // that is pushed at the end of the last fragment.
    let total_core_trace_rows = core_trace_contexts
        .len()
        .checked_mul(fragment_size)
        .and_then(|n| n.checked_add(1))
        .ok_or(ExecutionError::TraceLenExceeded(max_trace_len))?;
    if total_core_trace_rows > max_trace_len {
        return Err(ExecutionError::TraceLenExceeded(max_trace_len));
    }

    if core_trace_contexts.is_empty() {
        return Err(ExecutionError::Internal(
            "no trace fragments provided in the trace generation context",
        ));
    }

    let chiplets = initialize_chiplets(
        program_info.kernel().clone(),
        &core_trace_contexts,
        memory_writes,
        bitwise,
        kernel_replay,
        hasher_for_chiplet,
        ace_replay,
        max_trace_len,
    )?;

    let range_checker = initialize_range_checker(range_checker_replay, &chiplets);

    let mut core_trace_columns = generate_core_trace_columns(
        core_trace_contexts,
        program_info.kernel().clone(),
        fragment_size,
    )?;

    // Calculate trace length
    let core_trace_len = core_trace_columns[0].len();

    // Get the number of rows for the range checker
    let range_table_len = range_checker.get_number_range_checker_rows();

    let trace_len_summary =
        TraceLenSummary::new(core_trace_len, range_table_len, ChipletsLengths::new(&chiplets));

    // Compute the final main trace length
    let main_trace_len =
        compute_main_trace_length(core_trace_len, range_table_len, chiplets.trace_len());

    let ((), (range_checker_trace, chiplets_trace)) = rayon::join(
        || pad_trace_columns(&mut core_trace_columns, main_trace_len),
        || {
            rayon::join(
                || range_checker.into_trace_with_table(range_table_len, main_trace_len),
                || chiplets.into_trace(main_trace_len),
            )
        },
    );

    // Padding to make the number of columns a multiple of 8, i.e., the Poseidon2 permutation rate
    let padding_columns = vec![vec![ZERO; main_trace_len]; PADDED_TRACE_WIDTH - TRACE_WIDTH];
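    // (Illustrative: if `TRACE_WIDTH` were, say, 71, then `PADDED_TRACE_WIDTH` would be 72, and a
    // single zero column would be appended; the actual widths are defined in `miden_air`.)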

    // Chain all trace columns together
    let trace_columns: Vec<Vec<Felt>> = core_trace_columns
        .into_iter()
        .chain(range_checker_trace.trace)
        .chain(chiplets_trace.trace)
        .chain(padding_columns)
        .collect();

    // Create the MainTrace
    let main_trace = {
        let last_program_row = RowIndex::from((core_trace_len as u32).saturating_sub(1));
        let col_matrix = ColMatrix::new(trace_columns);
        MainTrace::new(col_matrix, last_program_row)
    };

    // Create aux trace builders
    let aux_trace_builders = AuxTraceBuilders {
        decoder: DecoderAuxTraceBuilder::default(),
        range: range_checker_trace.aux_builder,
        chiplets: chiplets_trace.aux_builder,
        stack: StackAuxTraceBuilder,
    };

    Ok(ExecutionTrace::new_from_parts(
        program_info,
        execution_output,
        main_trace,
        aux_trace_builders,
        trace_len_summary,
    ))
}

// HELPERS
// ================================================================================================

fn compute_main_trace_length(
    core_trace_len: usize,
    range_table_len: usize,
    chiplets_trace_len: usize,
) -> usize {
    // Get the trace length required to hold all execution trace steps
    let max_len = range_table_len.max(core_trace_len).max(chiplets_trace_len);

    // Pad the trace length to the next power of two
    let trace_len = max_len.next_power_of_two();
    core::cmp::max(trace_len, MIN_TRACE_LEN)
}
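
// A minimal sketch (hypothetical test, not part of the original source) of the padding rule above:
// the result is the next power of two of the largest input, floored at `MIN_TRACE_LEN`.
#[cfg(test)]
#[test]
fn main_trace_length_is_padded_to_a_power_of_two() {
    assert_eq!(compute_main_trace_length(0, 0, 0), MIN_TRACE_LEN);
    assert_eq!(compute_main_trace_length(1000, 900, 800), 1024usize.max(MIN_TRACE_LEN));
}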

/// Generates the core trace columns in parallel from the provided trace fragment contexts.
fn generate_core_trace_columns(
    core_trace_contexts: Vec<CoreTraceFragmentContext>,
    kernel: Kernel,
    fragment_size: usize,
) -> Result<Vec<Vec<Felt>>, ExecutionError> {
    let mut core_trace_columns: Vec<Vec<Felt>> =
        vec![vec![ZERO; core_trace_contexts.len() * fragment_size]; CORE_TRACE_WIDTH];

    // Save the first stack top for initialization
    let first_stack_top = if let Some(first_context) = core_trace_contexts.first() {
        first_context.state.stack.stack_top.to_vec()
    } else {
        vec![ZERO; MIN_STACK_DEPTH]
    };

    let mut fragments = create_fragments_from_trace_columns(&mut core_trace_columns, fragment_size);

    // Build the core trace fragments in parallel
    let fragment_results: Result<Vec<_>, ExecutionError> = core_trace_contexts
        .into_par_iter()
        .zip(fragments.par_iter_mut())
        .map(|(trace_state, fragment)| {
            let (mut processor, mut tracer, mut continuation_stack, mut current_forest) =
                split_trace_fragment_context(trace_state, fragment, fragment_size);

            processor.execute(
                &mut continuation_stack,
                &mut current_forest,
                &kernel,
                &mut tracer,
            )?;

            tracer.into_final_state()
        })
        .collect();
    let fragment_results = fragment_results?;

    // Separate fragments, stack_rows, and system_rows
    let mut stack_rows = Vec::new();
    let mut system_rows = Vec::new();
    let mut total_core_trace_rows = 0;

    for final_state in fragment_results {
        stack_rows.push(final_state.last_stack_cols);
        system_rows.push(final_state.last_system_cols);
        total_core_trace_rows += final_state.num_rows_written;
    }

    // Fix up stack and system rows
    fixup_stack_and_system_rows(
        &mut core_trace_columns,
        fragment_size,
        &stack_rows,
        &system_rows,
        &first_stack_top,
    );

    // Run batch inversion on stack's H0 helper column, processing each fragment in parallel.
    // This must be done after fixup_stack_and_system_rows since that function overwrites the first
    // row of each fragment with non-inverted values.
    {
        let h0_column = &mut core_trace_columns[STACK_TRACE_OFFSET + H0_COL_IDX];
        h0_column[..total_core_trace_rows]
            .par_chunks_mut(fragment_size)
            .for_each(batch_inversion_allow_zeros);
    }
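
    // (For reference: a batch inversion computes prefix products p_i = a_1 * ... * a_i, inverts
    // only the final product, and unwinds it to recover every 1/a_i with a single field inversion;
    // per its name, `batch_inversion_allow_zeros` presumably maps zero entries to zero rather than
    // failing.)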

    // Truncate the core trace columns to the actual number of rows written.
    for col in core_trace_columns.iter_mut() {
        col.truncate(total_core_trace_rows);
    }

    push_halt_opcode_row(
        &mut core_trace_columns,
        system_rows.last().ok_or(ExecutionError::Internal(
            "no trace fragments provided in the trace generation context",
        ))?,
        stack_rows.last().ok_or(ExecutionError::Internal(
            "no trace fragments provided in the trace generation context",
        ))?,
    );

    Ok(core_trace_columns)
}

/// Splits the core trace columns into fragments of the specified size, returning a vector of
/// `CoreTraceFragment`s that each borrow from the original columns.
fn create_fragments_from_trace_columns(
    core_trace_columns: &mut [Vec<Felt>],
    fragment_size: usize,
) -> Vec<CoreTraceFragment<'_>> {
    let mut column_chunks: Vec<_> = core_trace_columns
        .iter_mut()
        .map(|col| col.chunks_exact_mut(fragment_size))
        .collect();
    let mut core_trace_fragments = Vec::new();

    loop {
        let fragment_cols: Vec<&mut [Felt]> =
            column_chunks.iter_mut().filter_map(|col_chunk| col_chunk.next()).collect();
        assert!(
            fragment_cols.is_empty() || fragment_cols.len() == CORE_TRACE_WIDTH,
            "column chunks don't all have the same size"
        );

        if fragment_cols.is_empty() {
            return core_trace_fragments;
        } else {
            core_trace_fragments.push(CoreTraceFragment {
                columns: fragment_cols.try_into().expect("fragment has CORE_TRACE_WIDTH columns"),
            });
        }
    }
}
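
// A minimal sketch (hypothetical test, not part of the original source): columns whose length is
// an exact multiple of `fragment_size` split into `len / fragment_size` fragments.
#[cfg(test)]
#[test]
fn trace_columns_split_into_exact_fragments() {
    let mut cols = vec![vec![ZERO; 8]; CORE_TRACE_WIDTH];
    let fragments = create_fragments_from_trace_columns(&mut cols, 4);
    assert_eq!(fragments.len(), 2);
}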

/// Initializes the first row of each fragment with the appropriate stack and system state.
///
/// This needs to be done as a separate pass after all fragments have been generated, because the
/// system and stack rows write the state at clk `i` to the row at index `i + 1`. Hence, the state
/// produced by the last row of any given fragment cannot be written in parallel, since a fragment
/// filler doesn't have access to the next fragment's first row.
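///
/// For example (illustrative numbers): with a fragment size of 4, the state at clk 3 belongs in
/// row 4, i.e., the first row of the second fragment, which the first fragment's filler cannot
/// reach; this pass writes that boundary row instead.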
fn fixup_stack_and_system_rows(
    core_trace_columns: &mut [Vec<Felt>],
    fragment_size: usize,
    stack_rows: &[[Felt; STACK_TRACE_WIDTH]],
    system_rows: &[[Felt; SYS_TRACE_WIDTH]],
    first_stack_top: &[Felt],
) {
    const MIN_STACK_DEPTH_FELT: Felt = Felt::new(MIN_STACK_DEPTH as u64);

    let system_state_first_row = [
        ZERO, // clk starts at 0
        ZERO, // ctx starts at 0 (root context)
        ZERO, // fn_hash[0] starts as 0
        ZERO, // fn_hash[1] starts as 0
        ZERO, // fn_hash[2] starts as 0
        ZERO, // fn_hash[3] starts as 0
    ];

    // Initialize the first fragment with first_stack_top + [16, 0, 0] and system_state_first_row
    {
        // Set system state
        for (col_idx, &value) in system_state_first_row.iter().enumerate() {
            core_trace_columns[col_idx][0] = value;
        }

        // Set stack top (16 elements)
        // Note: we call `rev()` here because the stack order is reversed in the trace.
        // trace: [top, ..., bottom] vs stack: [bottom, ..., top]
        for (stack_col_idx, &value) in first_stack_top.iter().rev().enumerate() {
            core_trace_columns[STACK_TRACE_OFFSET + STACK_TOP_OFFSET + stack_col_idx][0] = value;
        }

        // Set stack helpers: [16, 0, 0]
        core_trace_columns[STACK_TRACE_OFFSET + B0_COL_IDX][0] = MIN_STACK_DEPTH_FELT;
        core_trace_columns[STACK_TRACE_OFFSET + B1_COL_IDX][0] = ZERO;
        core_trace_columns[STACK_TRACE_OFFSET + H0_COL_IDX][0] = ZERO;
    }

    // Determine the starting row indices for each fragment after the first.
    // We skip the first due to it already being initialized above.
    let fragment_start_row_indices = {
        let num_fragments = core_trace_columns[0].len() / fragment_size;

        (0..).step_by(fragment_size).take(num_fragments).skip(1)
    };

    // Initialize subsequent fragments with their corresponding rows from the previous fragment
    for (row_idx, (system_row, stack_row)) in
        fragment_start_row_indices.zip(system_rows.iter().zip(stack_rows.iter()))
    {
        // Copy the system_row to the first row of this fragment
        for (col_idx, &value) in system_row.iter().enumerate() {
            core_trace_columns[col_idx][row_idx] = value;
        }

        // Copy the stack_row to the first row of this fragment
        for (col_idx, &value) in stack_row.iter().enumerate() {
            core_trace_columns[STACK_TRACE_OFFSET + col_idx][row_idx] = value;
        }
    }
}

/// Appends a row with the HALT opcode to the end of the last fragment.
///
/// This ensures that the trace ends with at least one HALT operation, which is necessary to satisfy
/// the constraints.
fn push_halt_opcode_row(
    core_trace_columns: &mut [Vec<Felt>],
    last_system_state: &[Felt; SYS_TRACE_WIDTH],
    last_stack_state: &[Felt; STACK_TRACE_WIDTH],
) {
    // system columns
    // ---------------------------------------------------------------------------------------
    for (col_idx, &value) in last_system_state.iter().enumerate() {
        core_trace_columns[col_idx].push(value);
    }

    // stack columns
    // ---------------------------------------------------------------------------------------
    for (col_idx, &value) in last_stack_state.iter().enumerate() {
        core_trace_columns[STACK_TRACE_OFFSET + col_idx].push(value);
    }

    // decoder columns: padding with final decoder state
    // ---------------------------------------------------------------------------------------
    // Pad addr trace (decoder block address column) with ZEROs
    core_trace_columns[DECODER_TRACE_OFFSET + ADDR_COL_IDX].push(ZERO);

    // Pad op_bits columns with HALT opcode bits
    let halt_opcode = opcodes::HALT;
    for bit_idx in 0..NUM_OP_BITS {
        let bit_value = Felt::from_u8((halt_opcode >> bit_idx) & 1);
        core_trace_columns[DECODER_TRACE_OFFSET + OP_BITS_OFFSET + bit_idx].push(bit_value);
    }

    // Pad hasher state columns (8 columns)
    // - First 4 columns: copy the last value (to propagate program hash)
    // - Remaining 4 columns: fill with ZEROs
    for hasher_col_idx in 0..NUM_HASHER_COLUMNS {
        let col_idx = DECODER_TRACE_OFFSET + HASHER_STATE_OFFSET + hasher_col_idx;
        if hasher_col_idx < 4 {
            // For first 4 hasher columns, copy the last value to propagate program hash
            let last_row_idx = core_trace_columns[col_idx].len() - 1;
            let last_hasher_value = core_trace_columns[col_idx][last_row_idx];
            core_trace_columns[col_idx].push(last_hasher_value);
        } else {
            // For remaining 4 hasher columns, fill with ZEROs
            core_trace_columns[col_idx].push(ZERO);
        }
    }

    // Pad in_span column with ZEROs
    core_trace_columns[DECODER_TRACE_OFFSET + IN_SPAN_COL_IDX].push(ZERO);

    // Pad group_count column with ZEROs
    core_trace_columns[DECODER_TRACE_OFFSET + GROUP_COUNT_COL_IDX].push(ZERO);

    // Pad op_idx column with ZEROs
    core_trace_columns[DECODER_TRACE_OFFSET + OP_INDEX_COL_IDX].push(ZERO);

    // Pad op_batch_flags columns (3 columns) with ZEROs
    for batch_flag_idx in 0..NUM_OP_BATCH_FLAGS {
        let col_idx = DECODER_TRACE_OFFSET + OP_BATCH_FLAGS_OFFSET + batch_flag_idx;
        core_trace_columns[col_idx].push(ZERO);
    }

    // Pad op_bit_extra columns (2 columns)
    // - First column: fill with ZEROs (HALT doesn't use this)
    // - Second column: fill with ONEs (product of two most significant HALT bits, both are 1)
    core_trace_columns[DECODER_TRACE_OFFSET + OP_BITS_EXTRA_COLS_OFFSET].push(ZERO);
    core_trace_columns[DECODER_TRACE_OFFSET + OP_BITS_EXTRA_COLS_OFFSET + 1].push(ONE);
}
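
// A minimal sketch (hypothetical test, not part of the original source) of the op_bits_extra
// invariant used above: the second extra column holds the product of the two most significant
// opcode bits, and for HALT both of those bits are 1.
#[cfg(test)]
#[test]
fn halt_extra_bit_product_is_one() {
    let msb1 = Felt::from_u8((opcodes::HALT >> (NUM_OP_BITS - 1)) & 1);
    let msb2 = Felt::from_u8((opcodes::HALT >> (NUM_OP_BITS - 2)) & 1);
    assert_eq!(msb1 * msb2, ONE);
}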

/// Initializes the range checker from the range checks recorded during execution and returns it.
///
/// Note that the maximum number of rows that the range checker can produce is 2^16, which is less
/// than the maximum trace length (2^29). Hence, we can safely generate the entire range checker
/// trace and then pad it to the final trace length, without worrying about hitting memory limits.
fn initialize_range_checker(
    range_checker_replay: RangeCheckerReplay,
    chiplets: &Chiplets,
) -> RangeChecker {
    let mut range_checker = RangeChecker::new();

    // Add all u32 range checks recorded during execution
    for (clk, values) in range_checker_replay.into_iter() {
        range_checker.add_range_checks(clk, &values);
    }

    // Add all memory-related range checks
    chiplets.append_range_checks(&mut range_checker);

    range_checker
}

/// Replays the recorded operations to populate the chiplet traces. The results were already
/// consumed during execution, so this pass is run only for its trace-recording side effects.
fn initialize_chiplets(
    kernel: Kernel,
    core_trace_contexts: &[CoreTraceFragmentContext],
    memory_writes: MemoryWritesReplay,
    bitwise: BitwiseReplay,
    kernel_replay: KernelReplay,
    hasher_for_chiplet: HasherRequestReplay,
    ace_replay: AceReplay,
    max_trace_len: usize,
) -> Result<Chiplets, ExecutionError> {
    let check_chiplets_trace_len = |chiplets: &Chiplets| -> Result<(), ExecutionError> {
        if chiplets.trace_len() > max_trace_len {
            return Err(ExecutionError::TraceLenExceeded(max_trace_len));
        }
        Ok(())
    };

    let mut chiplets = Chiplets::new(kernel);

    // populate hasher chiplet
    for hasher_op in hasher_for_chiplet.into_iter() {
        match hasher_op {
            HasherOp::Permute(input_state) => {
                let _ = chiplets.hasher.permute(input_state);
                check_chiplets_trace_len(&chiplets)?;
            },
            HasherOp::HashControlBlock((h1, h2, domain, expected_hash)) => {
                let _ = chiplets.hasher.hash_control_block(h1, h2, domain, expected_hash);
                check_chiplets_trace_len(&chiplets)?;
            },
            HasherOp::HashBasicBlock((forest, node_id, expected_hash)) => {
                let node = forest
                    .get_node_by_id(node_id)
                    .ok_or(ExecutionError::Internal("invalid node ID in hasher replay"))?;
                let MastNode::Block(basic_block_node) = node else {
                    return Err(ExecutionError::Internal(
                        "expected basic block node in hasher replay",
                    ));
                };
                let op_batches = basic_block_node.op_batches();
                let _ = chiplets.hasher.hash_basic_block(op_batches, expected_hash);
                check_chiplets_trace_len(&chiplets)?;
            },
            HasherOp::BuildMerkleRoot((value, path, index)) => {
                let _ = chiplets.hasher.build_merkle_root(value, &path, index);
                check_chiplets_trace_len(&chiplets)?;
            },
            HasherOp::UpdateMerkleRoot((old_value, new_value, path, index)) => {
                chiplets.hasher.update_merkle_root(old_value, new_value, &path, index);
                check_chiplets_trace_len(&chiplets)?;
            },
        }
    }

    // populate bitwise chiplet
    for (bitwise_op, a, b) in bitwise {
        match bitwise_op {
            BitwiseOp::U32And => {
                chiplets.bitwise.u32and(a, b).map_exec_err_no_ctx()?;
                check_chiplets_trace_len(&chiplets)?;
            },
            BitwiseOp::U32Xor => {
                chiplets.bitwise.u32xor(a, b).map_exec_err_no_ctx()?;
                check_chiplets_trace_len(&chiplets)?;
            },
        }
    }

    // populate memory chiplet
    //
    // Note: care is taken to order all the accesses by clock cycle, since the memory chiplet
    // currently assumes that all memory accesses are issued in the same order as they appear in
    // the trace.
    {
        let elements_written: Box<dyn Iterator<Item = MemoryAccess>> =
            Box::new(memory_writes.iter_elements_written().map(|(element, addr, ctx, clk)| {
                MemoryAccess::WriteElement(*addr, *element, *ctx, *clk)
            }));
        let words_written: Box<dyn Iterator<Item = MemoryAccess>> = Box::new(
            memory_writes
                .iter_words_written()
                .map(|(word, addr, ctx, clk)| MemoryAccess::WriteWord(*addr, *word, *ctx, *clk)),
        );
        let elements_read: Box<dyn Iterator<Item = MemoryAccess>> =
            Box::new(core_trace_contexts.iter().flat_map(|ctx| {
                ctx.replay
                    .memory_reads
                    .iter_read_elements()
                    .map(|(_, addr, ctx, clk)| MemoryAccess::ReadElement(addr, ctx, clk))
            }));
        let words_read: Box<dyn Iterator<Item = MemoryAccess>> =
            Box::new(core_trace_contexts.iter().flat_map(|ctx| {
                ctx.replay
                    .memory_reads
                    .iter_read_words()
                    .map(|(_, addr, ctx, clk)| MemoryAccess::ReadWord(addr, ctx, clk))
            }));

        [elements_written, words_written, elements_read, words_read]
            .into_iter()
            .kmerge_by(|a, b| a.clk() < b.clk())
            .try_for_each(|mem_access| {
                match mem_access {
                    MemoryAccess::ReadElement(addr, ctx, clk) => chiplets
                        .memory
                        .read(ctx, addr, clk)
                        .map(|_| ())
                        .map_err(ExecutionError::MemoryErrorNoCtx)?,
                    MemoryAccess::WriteElement(addr, element, ctx, clk) => chiplets
                        .memory
                        .write(ctx, addr, clk, element)
                        .map_err(ExecutionError::MemoryErrorNoCtx)?,
                    MemoryAccess::ReadWord(addr, ctx, clk) => chiplets
                        .memory
                        .read_word(ctx, addr, clk)
                        .map(|_| ())
                        .map_err(ExecutionError::MemoryErrorNoCtx)?,
                    MemoryAccess::WriteWord(addr, word, ctx, clk) => chiplets
                        .memory
                        .write_word(ctx, addr, clk, word)
                        .map_err(ExecutionError::MemoryErrorNoCtx)?,
                }
                check_chiplets_trace_len(&chiplets)
            })?;

        enum MemoryAccess {
            ReadElement(Felt, ContextId, RowIndex),
            WriteElement(Felt, Felt, ContextId, RowIndex),
            ReadWord(Felt, ContextId, RowIndex),
            WriteWord(Felt, Word, ContextId, RowIndex),
        }

        impl MemoryAccess {
            fn clk(&self) -> RowIndex {
                match self {
                    MemoryAccess::ReadElement(_, _, clk) => *clk,
                    MemoryAccess::WriteElement(_, _, _, clk) => *clk,
                    MemoryAccess::ReadWord(_, _, clk) => *clk,
                    MemoryAccess::WriteWord(_, _, _, clk) => *clk,
                }
            }
        }
    }

    // populate ACE chiplet
    for (clk, circuit_eval) in ace_replay.into_iter() {
        chiplets.ace.add_circuit_evaluation(clk, circuit_eval);
        check_chiplets_trace_len(&chiplets)?;
    }

    // populate kernel ROM
    for proc_hash in kernel_replay.into_iter() {
        chiplets.kernel_rom.access_proc(proc_hash).map_exec_err_no_ctx()?;
        check_chiplets_trace_len(&chiplets)?;
    }

    Ok(chiplets)
}
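
// A minimal sketch (hypothetical test, not part of the original source) of the `kmerge_by` merge
// used above: each input stream is already sorted by clock cycle, and `kmerge_by` yields a single
// globally sorted stream, which is the order the memory chiplet expects.
#[cfg(test)]
#[test]
fn kmerge_by_merges_sorted_streams() {
    let merged: Vec<u32> =
        [vec![1, 4, 7], vec![2, 5], vec![3, 6]].into_iter().kmerge_by(|a, b| a < b).collect();
    assert_eq!(merged, vec![1, 2, 3, 4, 5, 6, 7]);
}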

fn pad_trace_columns(trace_columns: &mut [Vec<Felt>], main_trace_len: usize) {
    let total_program_rows = trace_columns[0].len();
    assert!(total_program_rows <= main_trace_len);

    let num_padding_rows = main_trace_len - total_program_rows;

    // System columns
    // ------------------------

    // Pad CLK trace - fill with index values
    for padding_row_idx in 0..num_padding_rows {
        trace_columns[CLK_COL_IDX]
            .push(Felt::from_u32((total_program_rows + padding_row_idx) as u32));
    }

    // Pad CTX trace - fill with ZEROs (root context)
    trace_columns[CTX_COL_IDX].resize(main_trace_len, ZERO);

    // Pad FN_HASH traces (4 columns) - fill with ZEROs as program execution must always end in the
    // root context.
    for fn_hash_col_idx in FN_HASH_RANGE {
        trace_columns[fn_hash_col_idx].resize(main_trace_len, ZERO);
    }

    // Decoder columns
    // ------------------------

    // Pad addr trace (decoder block address column) with ZEROs
    trace_columns[DECODER_TRACE_OFFSET + ADDR_COL_IDX].resize(main_trace_len, ZERO);

    // Pad op_bits columns with HALT opcode bits
    let halt_opcode = opcodes::HALT;
    for i in 0..NUM_OP_BITS {
        let bit_value = Felt::from_u8((halt_opcode >> i) & 1);
        trace_columns[DECODER_TRACE_OFFSET + OP_BITS_OFFSET + i].resize(main_trace_len, bit_value);
    }

    // Pad hasher state columns (8 columns)
    // - First 4 columns: copy the last value (to propagate program hash)
    // - Remaining 4 columns: fill with ZEROs
    for i in 0..NUM_HASHER_COLUMNS {
        let col_idx = DECODER_TRACE_OFFSET + HASHER_STATE_OFFSET + i;
        if i < 4 {
            // For first 4 hasher columns, copy the last value to propagate program hash
            // Safety: per our documented safety guarantees, we know that `total_program_rows > 0`,
            // and row `total_program_rows - 1` is initialized.
            let last_hasher_value = trace_columns[col_idx][total_program_rows - 1];
            trace_columns[col_idx].resize(main_trace_len, last_hasher_value);
        } else {
            // For remaining 4 hasher columns, fill with ZEROs
            trace_columns[col_idx].resize(main_trace_len, ZERO);
        }
    }

    // Pad in_span column with ZEROs
    trace_columns[DECODER_TRACE_OFFSET + IN_SPAN_COL_IDX].resize(main_trace_len, ZERO);

    // Pad group_count column with ZEROs
    trace_columns[DECODER_TRACE_OFFSET + GROUP_COUNT_COL_IDX].resize(main_trace_len, ZERO);

    // Pad op_idx column with ZEROs
    trace_columns[DECODER_TRACE_OFFSET + OP_INDEX_COL_IDX].resize(main_trace_len, ZERO);

    // Pad op_batch_flags columns (3 columns) with ZEROs
    for i in 0..NUM_OP_BATCH_FLAGS {
        trace_columns[DECODER_TRACE_OFFSET + OP_BATCH_FLAGS_OFFSET + i]
            .resize(main_trace_len, ZERO);
    }

    // Pad op_bit_extra columns (2 columns)
    // - First column: fill with ZEROs (HALT doesn't use this)
    // - Second column: fill with ONEs (product of two most significant HALT bits, both are 1)
    trace_columns[DECODER_TRACE_OFFSET + OP_BITS_EXTRA_COLS_OFFSET].resize(main_trace_len, ZERO);
    trace_columns[DECODER_TRACE_OFFSET + OP_BITS_EXTRA_COLS_OFFSET + 1].resize(main_trace_len, ONE);

    // Stack columns
    // ------------------------

    // Pad stack columns with the last value in each column (analogous to Stack::into_trace())
    for i in 0..STACK_TRACE_WIDTH {
        let col_idx = STACK_TRACE_OFFSET + i;
        // Safety: per our documented safety guarantees, we know that `total_program_rows > 0`,
        // and row `total_program_rows - 1` is initialized.
        let last_stack_value = trace_columns[col_idx][total_program_rows - 1];
        trace_columns[col_idx].resize(main_trace_len, last_stack_value);
    }
}
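
// A minimal sketch (hypothetical test, not part of the original source) of the padding rules
// above: the clk column continues with consecutive row indices, while stack columns repeat their
// final value.
#[cfg(test)]
#[test]
fn padding_extends_clk_and_repeats_stack() {
    let mut cols = vec![vec![ZERO; 1]; CORE_TRACE_WIDTH];
    pad_trace_columns(&mut cols, MIN_TRACE_LEN);
    assert_eq!(cols[CLK_COL_IDX][1], ONE);
    assert_eq!(cols[STACK_TRACE_OFFSET][MIN_TRACE_LEN - 1], ZERO);
}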

/// Uses the provided `CoreTraceFragmentContext` to build and return a `ReplayProcessor` and a
/// `CoreTraceGenerationTracer`, along with the continuation stack and initial MAST forest needed
/// to execute the fragment.
fn split_trace_fragment_context<'a>(
    fragment_context: CoreTraceFragmentContext,
    fragment: &'a mut CoreTraceFragment<'a>,
    fragment_size: usize,
) -> (
    ReplayProcessor,
    CoreTraceGenerationTracer<'a>,
    ContinuationStack,
    Arc<MastForest>,
) {
    let CoreTraceFragmentContext {
        state: CoreTraceState { system, decoder, stack },
        replay:
            ExecutionReplay {
                block_stack: block_stack_replay,
                execution_context: execution_context_replay,
                stack_overflow: stack_overflow_replay,
                memory_reads: memory_reads_replay,
                advice: advice_replay,
                hasher: hasher_response_replay,
                block_address: block_address_replay,
                mast_forest_resolution: mast_forest_resolution_replay,
            },
        continuation,
        initial_mast_forest,
    } = fragment_context;

    let processor = ReplayProcessor::new(
        system,
        stack,
        stack_overflow_replay,
        execution_context_replay,
        advice_replay,
        memory_reads_replay,
        hasher_response_replay,
        mast_forest_resolution_replay,
        fragment_size.into(),
    );
    let tracer =
        CoreTraceGenerationTracer::new(fragment, decoder, block_address_replay, block_stack_replay);

    (processor, tracer, continuation, initial_mast_forest)
}