use miden::core::stark::constants
#! Compute the LDE domain generator from the log2 of its size.
#!
#! Input: [log2(domain_size), ..]
#! Output: [domain_gen, ..]
#! Cycles: 63
pub proc compute_lde_generator
push.32
swap
sub
pow2
exec.constants::get_root_unity
swap
exp.u32
# => [domain_gen, ..]
end
#! Validates the inputs to the recursive verifier.
#!
#! Checks the two per-AIR `log(trace_length)` values from the stack and security
#! parameters from memory. The security parameters must already be stored in memory
#! before calling this procedure (done by the specific verifier's load_security_params).
#!
#! Input: [log(core_trace_length), log(chiplets_trace_length), ...]
#! Output: [log(core_trace_length), log(chiplets_trace_length), ...]
pub proc validate_inputs
# 1) Assert each log(trace_length) is u32.
# 2) Assert that each trace length is at most 29. The 2-adicity of our field is 32 and since
# the blowup factor is 8, we need to make sure that the LDE size is at most 2^32.
# We also check that each trace length is greater than the minimal length supported i.e., 2^6.
# The dup/drop pattern preserves both values on the stack for the caller.
dup u32assert
dup u32lt.30 assert.err="log(core_trace_length) must be less than 30"
dup u32gt.5 assert.err="log(core_trace_length) must be greater than 5"
drop
dup.1 u32assert
dup u32lt.30 assert.err="log(chiplets_trace_length) must be less than 30"
dup u32gt.5 assert.err="log(chiplets_trace_length) must be greater than 5"
drop
# 3) Assert that the number of FRI queries is at most 150. This restriction is a soft one
# and is due to the memory layout in the `constants.masm` files but can be updated
# therein.
# We also make sure that the number of FRI queries is at least 7.
exec.constants::get_number_queries
dup u32lt.151 assert.err="num_queries must be less than 151"
u32gt.6 assert.err="num_queries must be greater than 6"
# 4) Assert that PoW bit counts are less than 32 (must fit in u32 bit masking)
exec.constants::get_query_pow_bits
u32lt.32 assert.err="query_pow_bits must be less than 32"
exec.constants::get_deep_pow_bits
u32lt.32 assert.err="deep_pow_bits must be less than 32"
exec.constants::get_folding_pow_bits
u32lt.32 assert.err="folding_pow_bits must be less than 32"
end
#! Reverse the lowest `bits` bits of `index` using parallel bit-swap.
#! `pow2_shift` must equal 2^(32 - bits); the caller pre-computes it once.
#!
#! The algorithm has two parts:
#!
#! 1) Left-shift: multiply index by pow2_shift = 2^(32-bits) to place the `bits`
#! meaningful bits into the top of a 32-bit word. Since index < 2^bits, the
#! product is always < 2^32 so u32wrapping_mul is exact.
#!
#! 2) Full 32-bit reversal via 5 parallel swap steps. Each step k (for k=1,2,4,8,16)
#! swaps every adjacent group of k bits. A mask isolates the even-positioned
#! groups; the odd-positioned groups are the complement. Shifting each half by
#! k positions and combining swaps all groups simultaneously. After all 5 steps,
#! bit position i has moved to position 31-i, which (because of the initial
#! left-shift) is exactly position (bits-1-i) of the original -- i.e., the
#! reversed index. No final shift is needed.
#!
#! The two halves never overlap, so XOR = OR = ADD; we use XOR (1 cycle vs 3 for
#! u32wrapping_add).
#!
#! Input: [index, pow2_shift, ...]
#! Output: [rev_index, ...]
#! Cycles: 78
pub proc bit_reverse_len_parallel
u32wrapping_mul
# Step 1: swap adjacent bits
# x = ((x >> 1) & 0x55555555) ^ ((x & 0x55555555) << 1)
dup u32shr.1 push.0x55555555 u32and
swap push.0x55555555 u32and u32shl.1
u32xor
#=> [x1, ...]
# Step 2: swap bit pairs
# x = ((x >> 2) & 0x33333333) ^ ((x & 0x33333333) << 2)
dup u32shr.2 push.0x33333333 u32and
swap push.0x33333333 u32and u32shl.2
u32xor
#=> [x2, ...]
# Step 3: swap nibbles
# x = ((x >> 4) & 0x0F0F0F0F) ^ ((x & 0x0F0F0F0F) << 4)
dup u32shr.4 push.0x0F0F0F0F u32and
swap push.0x0F0F0F0F u32and u32shl.4
u32xor
#=> [x3, ...]
# Step 4: swap bytes
# x = ((x >> 8) & 0x00FF00FF) ^ ((x & 0x00FF00FF) << 8)
dup u32shr.8 push.0x00FF00FF u32and
swap push.0x00FF00FF u32and u32shl.8
u32xor
#=> [x4, ...]
# Step 5: swap halfwords
# x = (x >> 16) ^ (x << 16) (no mask needed, both halves fit in 16 bits)
dup u32shr.16
swap u32shl.16
u32xor
#=> [rev_index, ...]
end
#! Sets up the stark-vars region of the ACE circuit input layout.
#!
#! The stark-vars block is 10 EF slots (5 words) laid out as:
#!
#! Word 0 (slots 0-1): alpha, z^N (EF: composition challenge, trace-length power)
#! Word 1 (slots 2-3): z_k, is_first (EF: periodic eval point, first-row selector)
#! Word 2 (slots 4-5): is_last, is_transition (EF: last-row selector, transition selector)
#! Word 3 (slots 6-7): gamma, weight0 (EF: batching challenge; base as EF: barycentric weight)
#! Word 4 (slots 8-9): f, s0 (base as EF: chunk shift ratio h^N, first shift)
#!
#! Multi-AIR appends 8 more EF slots (4 words):
#! slots 10-11: β_multi_{core,chip} (set in `generate_constraint_composition_coefficients`)
#! slots 12-17: per-AIR lifted selectors for Core then Chiplets (Phase 4 below)
#!
#! Input: [max_cycle_len_log, ...]
#! Output: [...]
pub proc set_up_auxiliary_inputs_ace
# ==================================================================
# Phase 1: alpha, z^N -> Word 0 (z_k and z remain on stack)
# ==================================================================
padw exec.constants::composition_coef_ptr mem_loadw_le
# => [alpha0, alpha1, 0, 0, max_cycle_len_log, ...]
movup.3 movup.3
push.0.0 exec.constants::z_ptr mem_loadw_le
# => [z_n_0, z_n_1, z_0, z_1, alpha0, alpha1, max_cycle_len_log, ...]
# Compute z_k = z^(N / max_cycle_len) via repeated squaring.
exec.constants::get_trace_length_log
movup.7
sub
# => [k, z_n_0, z_n_1, z_0, z_1, alpha0, alpha1, ...]
# where k = log(trace_len) - log(max_cycle_len)
dup.4 dup.4
# => [z_0, z_1, k, z_n_0, z_n_1, z_0, z_1, alpha0, alpha1, ...]
push.1
while.true
dup.1 dup.1
ext2mul
dup.2 sub.1 swap.3
push.1 neq
end
movup.2 drop
# => [z_k_0, z_k_1, z_n_0, z_n_1, z_0, z_1, alpha0, alpha1, ...]
# Rearrange for Word 0 = [alpha0, alpha1, z_n_0, z_n_1]
movup.3 movup.3
# => [z_n_0, z_n_1, z_k_0, z_k_1, z_0, z_1, alpha0, alpha1, ...]
movup.7 movup.7
# => [alpha0, alpha1, z_n_0, z_n_1, z_k_0, z_k_1, z_0, z_1, ...]
exec.constants::auxiliary_ace_inputs_ptr
mem_storew_le
drop drop
# => [z_n_0, z_n_1, z_k_0, z_k_1, z_0, z_1, ...]
# ==================================================================
# Phase 2: is_first, is_last, is_transition -> Words 1-2
# ==================================================================
# Compute vanishing = z^N - 1 (z^N is still on the stack from Phase 1).
sub.1
# => [van_0, van_1, z_k_0, z_k_1, z_0, z_1, ...]
# is_first = vanishing * inv(z - 1)
dup.5 dup.5
# => [z_0, z_1, van_0, van_1, z_k_0, z_k_1, z_0, z_1, ...]
sub.1 ext2inv
# => [inv_z1_0, inv_z1_1, van_0, van_1, z_k_0, z_k_1, z_0, z_1, ...]
dup.3 dup.3
# => [van_0, van_1, inv_z1_0, inv_z1_1, van_0, van_1, z_k_0, z_k_1, z_0, z_1, ...]
ext2mul
# => [is_first_0, is_first_1, van_0, van_1, z_k_0, z_k_1, z_0, z_1, ...]
# Store Word 1: [z_k_0, z_k_1, is_first_0, is_first_1]
movup.5 movup.5
# => [z_k_0, z_k_1, is_first_0, is_first_1, van_0, van_1, z_0, z_1, ...]
exec.constants::auxiliary_ace_inputs_ptr add.4
mem_storew_le
dropw
# => [van_0, van_1, z_0, z_1, ...]
# is_transition = z - g^{-1} (g_inv is base-field, only coord 0 changes)
exec.constants::get_trace_domain_generator
inv
# => [g_inv, van_0, van_1, z_0, z_1, ...]
movup.3 swap sub
# => [is_trans_0, van_0, van_1, z_1, ...]
movup.3
swap
# => [is_trans_0, is_trans_1, van_0, van_1, ...]
# is_last = vanishing * inv(is_transition)
dup.1 dup.1
# => [is_trans_0, is_trans_1, is_trans_0, is_trans_1, van_0, van_1, ...]
ext2inv
# => [inv_t_0, inv_t_1, is_trans_0, is_trans_1, van_0, van_1, ...]
movup.5 movup.5
# => [van_0, van_1, inv_t_0, inv_t_1, is_trans_0, is_trans_1, ...]
ext2mul
# => [is_last_0, is_last_1, is_trans_0, is_trans_1, ...]
# Store Word 2: [is_last_0, is_last_1, is_trans_0, is_trans_1]
exec.constants::auxiliary_ace_inputs_ptr add.8
mem_storew_le
dropw
# => [...]
# ==================================================================
# Phase 3: f, s0, weight0, gamma -> Words 3-4
# ==================================================================
# Compute s0 = offset^N, where the LDE coset offset is g^(2^(TWO_ADICITY - log_lde))
# (g = 7, the field generator). Since log_lde = log(trace_len) + log_blowup, we have
# s0 = g^(2^(TWO_ADICITY - log_blowup)) = 7^(2^(32 - 3)) = 7^(2^29), independent of the
# trace height. Computed by repeated squaring.
push.7
repeat.29
dup mul
end
push.0 swap
# => [s0, 0, ...]
# Compute f = h^N (chunk shift ratio between cosets)
push.0
exec.constants::get_lde_domain_generator
exec.constants::get_trace_length
exp.u32
# => [f, 0, s0, 0, ...]
# Compute weight0 = 1 / (8 * s0^7)
dup.2 dup
mul
# => [s0^2, f, 0, s0, 0, ...]
dup dup mul
# => [s0^4, s0^2, f, 0, s0, 0, ...]
mul
# => [s0^6, f, 0, s0, 0, ...]
dup.3 mul
mul.8 inv
# => [weight0, f, 0, s0, 0, ...]
# Store Word 4: [f, 0, s0, 0]
movdn.4
# => [f, 0, s0, 0, weight0, ...]
exec.constants::auxiliary_ace_inputs_ptr add.16
mem_storew_le
dropw
# => [weight0, ...]
# Store Word 3: [gamma_0, gamma_1, weight0, 0]
# weight0 is still on the stack -- sample gamma and assemble the word.
exec.sample_gamma
# => [gamma_0, gamma_1, weight0, ...]
push.0 movdn.3
# => [gamma_0, gamma_1, weight0, 0, ...]
exec.constants::auxiliary_ace_inputs_ptr add.12
mem_storew_le
dropw
# Phase 4: per-AIR lifted selectors at `z^{r_j}` (r_j = 2^(log_max - log_j)).
# When AIR is at log_max (r=0) the values equal the canonical selectors.
# Layout (offsets from AUXILIARY_ACE_INPUTS_PTR):
# +24/+26/+28: is_first_core, is_last_core, is_transition_core
# +30/+32/+34: is_first_chip, is_last_chip, is_transition_chip
exec.constants::get_core_trace_length_log
push.24
exec.compute_per_air_selectors
exec.constants::get_chiplets_trace_length_log
push.30
exec.compute_per_air_selectors
end
#! Load the quad field element stored at `[addr, addr+1]` onto the stack.
#!
#! Input: [addr, ...]
#! Output: [q0, q1, ...] (q0 = mem[addr], q1 = mem[addr+1])
proc load_quad
dup add.1 mem_load
swap mem_load
end
#! Compute is_first/is_last/is_transition for AIR at `log_air` at `out_offset` in the
#! stark-vars region. `r_log = log_max - log_air` stashed in tmp4 across the two
#! repeated-squaring passes (z_lift, g_air_inv).
#!
#! Input: [out_offset, log_air, ...]
#! Output: [...]
proc compute_per_air_selectors
# r_log = log_max - log_air
swap
exec.constants::get_trace_length_log
swap sub
# => [r_log, out_offset, ...]
exec.constants::tmp4 mem_store
# => [out_offset, ...]
# Compute z_lift = z^(2^r_log).
exec.constants::z_ptr add.2 exec.load_quad
# => [z_0, z_1, out_offset, ...]
exec.constants::tmp4 mem_load
# => [r_log, z_0, z_1, out_offset, ...]
dup push.0 neq
while.true
movdn.2
dup.1 dup.1 ext2mul
movup.2 sub.1
dup push.0 neq
end
drop
# => [z_lift_0, z_lift_1, out_offset, ...]
# vanishing = z^N - 1 (loaded from slot 1 = offset +2/+3).
exec.constants::auxiliary_ace_inputs_ptr add.2 exec.load_quad
sub.1
# => [van_0, van_1, z_lift_0, z_lift_1, out_offset, ...]
# is_first_air = van / (z_lift - 1), stored at out_offset + 0/+1.
dup.3 dup.3
sub.1 ext2inv
dup.3 dup.3 ext2mul
# => [if_0, if_1, van_0, van_1, z_lift_0, z_lift_1, out_offset, ...]
dup.6 exec.constants::auxiliary_ace_inputs_ptr add mem_store
dup.5 exec.constants::auxiliary_ace_inputs_ptr add.1 add mem_store
# => [van_0, van_1, z_lift_0, z_lift_1, out_offset, ...]
# Compute g_air_inv = (g_max_inv)^(2^r_log).
exec.constants::get_trace_domain_generator
inv
exec.constants::tmp4 mem_load
# => [r_log, g_max_inv, van_0, van_1, z_lift_0, z_lift_1, out_offset, ...]
dup push.0 neq
while.true
swap dup mul
swap sub.1
dup push.0 neq
end
drop
# => [g_air_inv, van_0, van_1, z_lift_0, z_lift_1, out_offset, ...]
# is_transition_air = z_lift - g_air_inv (g_air_inv is base; only coord 0 changes).
movup.3 swap sub
movup.3
swap
# => [it_0, it_1, van_0, van_1, out_offset, ...]
# is_last_air = van / is_transition_air.
dup.1 dup.1
ext2inv
movup.5 movup.5
ext2mul
# => [il_0, il_1, it_0, it_1, out_offset, ...]
# Store is_last_air at out_offset + 2/+3.
dup.4 exec.constants::auxiliary_ace_inputs_ptr add.2 add mem_store
dup.3 exec.constants::auxiliary_ace_inputs_ptr add.3 add mem_store
# => [it_0, it_1, out_offset, ...]
# Store is_transition_air at out_offset + 4/+5.
dup.2 exec.constants::auxiliary_ace_inputs_ptr add.4 add mem_store
dup.1 exec.constants::auxiliary_ace_inputs_ptr add.5 add mem_store
# => [out_offset, ...]
drop
end
#! Sample the batching challenge gamma (EF element).
#!
#! gamma batches the constraint evaluation with the auxiliary trace boundary checks:
#! output = constraint_check + gamma * product_check + gamma^2 * sum_check
#!
#! WORKAROUND (unsound): Reads gamma directly from the sponge rate buffer without
#! consuming it. This gives gamma == deep_alpha (the next challenge that will be
#! properly sampled in generate_deep_composition_random_coefficients). Using the
#! same value for both is unsound.
#!
#! TODO: Replace with a proper `sample_ext` call once the Rust-side transcript
#! draws gamma between the OOD and DEEP phases.
#!
#! Input: [...]
#! Output: [gamma_0, gamma_1, ...]
proc sample_gamma
# Assert output_len == 8 so the rate buffer indices are correct.
exec.constants::random_coin_output_len_ptr mem_load push.8 assert_eq
# Read the top 2 elements of the sponge rate buffer.
exec.constants::r1_ptr add.7 mem_load
exec.constants::r1_ptr add.6 mem_load
swap
end
#! Executes the constraints evaluation check.
#!
#! Inputs: [...]
#! Outputs: [...]
#!
#! Invocation: exec
pub proc execute_constraint_evaluation_check
exec.constants::get_procedure_digest_execute_constraint_evaluation_check_ptr dynexec
end
# OOD MEMORY SHUFFLE
# =================================================================================================
#
# When `core_height > chiplets_height`, proof_order is `[Chiplets, Core]` but the codegen's slot
# indexing is fixed in caller_order. The procs below conditionally swap per-AIR halves of the OOD
# frame and aux_bus_boundary so eval_circuit reads the right values.
#! Copy `n_words` consecutive words from `src` to `dst`. Forward iteration; safe for left-shifts.
#! Uses TMP3/TMP4 as iterator state.
#!
#! Input: [n_words, dst, src, ...]
#! Output: [...]
proc copy_words_fwd
swap exec.constants::tmp4 mem_store
swap exec.constants::tmp3 mem_store
# => [n_words, ...]
dup push.0 neq
while.true
sub.1
padw exec.constants::tmp3 mem_load mem_loadw_le
exec.constants::tmp4 mem_load mem_storew_le
dropw
exec.constants::tmp3 mem_load add.4
exec.constants::tmp3 mem_store
exec.constants::tmp4 mem_load add.4
exec.constants::tmp4 mem_store
dup push.0 neq
end
drop
end
#! Swap `mem[base..base+16]` ↔ `mem[base+16..base+32]` via SWAP_SCRATCH.
#! Input: [base, ...]
proc swap_two_4_word_blocks
dup
exec.constants::swap_scratch_ptr push.4
exec.copy_words_fwd
dup add.16 dup.1 push.4
exec.copy_words_fwd
add.16 exec.constants::swap_scratch_ptr swap push.4
exec.copy_words_fwd
end
#! Swap [chip_main(48 felts), core_main(112 felts)] → [core_main, chip_main] at `base`.
#! Input: [base, ...]
proc swap_chip_core_main_blocks
dup
exec.constants::swap_scratch_ptr push.12
exec.copy_words_fwd
dup add.48 dup.1 push.28
exec.copy_words_fwd
add.112 exec.constants::swap_scratch_ptr swap push.12
exec.copy_words_fwd
end
#! Swap the two EF halves of the aux_bus_boundary word: [c0, c1, k0, k1] → [k0, k1, c0, c1].
proc swap_aux_bus_boundary_word
padw exec.constants::aux_bus_boundary_ptr mem_loadw_le
movup.2 movup.3 swap
exec.constants::aux_bus_boundary_ptr mem_storew_le
dropw
end
#! Shuffle the OOD frame and aux_bus_boundary from proof_order to caller_order if
#! `log_chip < log_core` (i.e. proof_order = [Chiplets, Core]). No-op otherwise.
#! MUST run after OOD streaming + aux observation, before eval_circuit.
pub proc swap_air_regions_if_chip_first
exec.constants::get_chiplets_trace_length_log
exec.constants::get_core_trace_length_log
u32lt
# 1 iff log_chip < log_core (proof_order = [Chiplets, Core])
if.true
# main_curr at OOD_EVALUATIONS_PTR + 0
exec.constants::ood_evaluations_ptr
exec.swap_chip_core_main_blocks
# aux_curr at OOD_EVALUATIONS_PTR + 160 (16-felt symmetric swap)
exec.constants::ood_evaluations_ptr add.160
exec.swap_two_4_word_blocks
# main_next at OOD_EVALUATIONS_PTR + 224
exec.constants::ood_evaluations_ptr add.224
exec.swap_chip_core_main_blocks
# aux_next at OOD_EVALUATIONS_PTR + 384
exec.constants::ood_evaluations_ptr add.384
exec.swap_two_4_word_blocks
# aux_bus_boundary
exec.swap_aux_bus_boundary_word
end
end
#! Observes the auxiliary trace: draws aux randomness, reseeds with the aux trace commitment,
#! and absorbs aux trace boundary values into the transcript.
#!
#! For AIRs without an auxiliary trace, the implementation should be a no-op.
#!
#! Inputs: [...]
#! Outputs: [...]
#!
#! Invocation: exec
pub proc observe_aux_trace
exec.constants::get_procedure_digest_observe_aux_trace_ptr dynexec
end
#! Stores digests of dynamically executed procedures.
#!
#! Input: [D0, D1, D2, D3, D4, ...]
#! Output: [...]
pub proc store_dynamically_executed_procedures
exec.constants::get_procedure_digest_observe_aux_trace_ptr mem_storew_le dropw
exec.constants::get_procedure_digest_process_public_inputs_ptr mem_storew_le dropw
exec.constants::get_procedure_digest_process_row_ood_evaluations_ptr mem_storew_le dropw
exec.constants::get_procedure_digest_execute_constraint_evaluation_check_ptr mem_storew_le dropw
exec.constants::get_procedure_digest_compute_deep_composition_polynomial_queries_ptr mem_storew_le dropw
# => [...]
end