miden-core-lib 0.24.2

use miden::core::stark::constants

#! Compute the LDE domain generator from the log2 of its size.
#!
#! Input: [log2(domain_size), ..]
#! Output: [domain_gen, ..]
#! Cycles: 63
pub proc compute_lde_generator
    push.32
    swap
    sub
    pow2
    exec.constants::get_root_unity
    swap
    exp.u32
    # => [domain_gen, ..]
end

#! Validates the inputs to the recursive verifier.
#!
#! Checks the two per-AIR `log(trace_length)` values from the stack and security
#! parameters from memory. The security parameters must already be stored in memory
#! before calling this procedure (done by the specific verifier's load_security_params).
#!
#! Input: [log(core_trace_length), log(chiplets_trace_length), ...]
#! Output: [log(core_trace_length), log(chiplets_trace_length), ...]
pub proc validate_inputs
    # 1) Assert each log(trace_length) is u32.
    # 2) Assert that each trace length is at most 29. The 2-adicity of our field is 32 and since
    #    the blowup factor is 8, we need to make sure that the LDE size is at most 2^32.
    #    We also check that each trace length is greater than the minimal length supported i.e., 2^6.
    #    The dup/drop pattern preserves both values on the stack for the caller.
    dup u32assert
    dup u32lt.30 assert.err="log(core_trace_length) must be less than 30"
    dup u32gt.5 assert.err="log(core_trace_length) must be greater than 5"
    drop

    dup.1 u32assert
    dup u32lt.30 assert.err="log(chiplets_trace_length) must be less than 30"
    dup u32gt.5 assert.err="log(chiplets_trace_length) must be greater than 5"
    drop

    # 3) Assert that the number of FRI queries is at most 150. This restriction is a soft one
    #    and is due to the memory layout in the `constants.masm` files but can be updated
    #    therein.
    #    We also make sure that the number of FRI queries is at least 7.
    exec.constants::get_number_queries
    dup u32lt.151 assert.err="num_queries must be less than 151"
    u32gt.6 assert.err="num_queries must be greater than 6"

    # 4) Assert that PoW bit counts are less than 32 (must fit in u32 bit masking)
    exec.constants::get_query_pow_bits
    u32lt.32 assert.err="query_pow_bits must be less than 32"

    exec.constants::get_deep_pow_bits
    u32lt.32 assert.err="deep_pow_bits must be less than 32"

    exec.constants::get_folding_pow_bits
    u32lt.32 assert.err="folding_pow_bits must be less than 32"
end

#! Reverse the lowest `bits` bits of `index` using parallel bit-swap.
#! `pow2_shift` must equal 2^(32 - bits); the caller pre-computes it once.
#!
#! The algorithm has two parts:
#!
#! 1) Left-shift: multiply index by pow2_shift = 2^(32-bits) to place the `bits`
#!    meaningful bits into the top of a 32-bit word. Since index < 2^bits, the
#!    product is always < 2^32 so u32wrapping_mul is exact.
#!
#! 2) Full 32-bit reversal via 5 parallel swap steps. Each step k (for k=1,2,4,8,16)
#!    swaps every adjacent group of k bits. A mask isolates the even-positioned
#!    groups; the odd-positioned groups are the complement. Shifting each half by
#!    k positions and combining swaps all groups simultaneously. After all 5 steps,
#!    bit position i has moved to position 31-i, which (because of the initial
#!    left-shift) is exactly position (bits-1-i) of the original -- i.e., the
#!    reversed index. No final shift is needed.
#!
#! The two halves never overlap, so XOR = OR = ADD; we use XOR (1 cycle vs 3 for
#! u32wrapping_add).
#!
#! Input: [index, pow2_shift, ...]
#! Output: [rev_index, ...]
#! Cycles: 78
pub proc bit_reverse_len_parallel
    u32wrapping_mul

    # Step 1: swap adjacent bits
    # x = ((x >> 1) & 0x55555555) ^ ((x & 0x55555555) << 1)
    dup u32shr.1 push.0x55555555 u32and
    swap push.0x55555555 u32and u32shl.1
    u32xor
    #=> [x1, ...]

    # Step 2: swap bit pairs
    # x = ((x >> 2) & 0x33333333) ^ ((x & 0x33333333) << 2)
    dup u32shr.2 push.0x33333333 u32and
    swap push.0x33333333 u32and u32shl.2
    u32xor
    #=> [x2, ...]

    # Step 3: swap nibbles
    # x = ((x >> 4) & 0x0F0F0F0F) ^ ((x & 0x0F0F0F0F) << 4)
    dup u32shr.4 push.0x0F0F0F0F u32and
    swap push.0x0F0F0F0F u32and u32shl.4
    u32xor
    #=> [x3, ...]

    # Step 4: swap bytes
    # x = ((x >> 8) & 0x00FF00FF) ^ ((x & 0x00FF00FF) << 8)
    dup u32shr.8 push.0x00FF00FF u32and
    swap push.0x00FF00FF u32and u32shl.8
    u32xor
    #=> [x4, ...]

    # Step 5: swap halfwords
    # x = (x >> 16) ^ (x << 16)  (no mask needed, both halves fit in 16 bits)
    dup u32shr.16
    swap u32shl.16
    u32xor
    #=> [rev_index, ...]
end

#! Sets up the stark-vars region of the ACE circuit input layout.
#!
#! The stark-vars block is 10 EF slots (5 words) laid out as:
#!
#!   Word 0 (slots 0-1): alpha, z^N                (EF: composition challenge, trace-length power)
#!   Word 1 (slots 2-3): z_k, is_first             (EF: periodic eval point, first-row selector)
#!   Word 2 (slots 4-5): is_last, is_transition    (EF: last-row selector, transition selector)
#!   Word 3 (slots 6-7): gamma, weight0            (EF: batching challenge; base as EF: barycentric weight)
#!   Word 4 (slots 8-9): f, s0                     (base as EF: chunk shift ratio h^N, first shift)
#!
#! Multi-AIR appends 8 more EF slots (4 words):
#!   slots 10-11: β_multi_{core,chip}  (set in `generate_constraint_composition_coefficients`)
#!   slots 12-17: per-AIR lifted selectors for Core then Chiplets (Phase 4 below)
#!
#! Input: [max_cycle_len_log, ...]
#! Output: [...]
pub proc set_up_auxiliary_inputs_ace
    # ==================================================================
    # Phase 1: alpha, z^N -> Word 0  (z_k and z remain on stack)
    # ==================================================================

    padw exec.constants::composition_coef_ptr mem_loadw_le
    # => [alpha0, alpha1, 0, 0, max_cycle_len_log, ...]

    movup.3 movup.3
    push.0.0 exec.constants::z_ptr mem_loadw_le
    # => [z_n_0, z_n_1, z_0, z_1, alpha0, alpha1, max_cycle_len_log, ...]

    # Compute z_k = z^(N / max_cycle_len) via repeated squaring.
    exec.constants::get_trace_length_log
    movup.7
    sub
    # => [k, z_n_0, z_n_1, z_0, z_1, alpha0, alpha1, ...]
    #    where k = log(trace_len) - log(max_cycle_len)

    dup.4 dup.4
    # => [z_0, z_1, k, z_n_0, z_n_1, z_0, z_1, alpha0, alpha1, ...]

    push.1
    while.true
        dup.1 dup.1
        ext2mul
        dup.2 sub.1 swap.3
        push.1 neq
    end

    movup.2 drop
    # => [z_k_0, z_k_1, z_n_0, z_n_1, z_0, z_1, alpha0, alpha1, ...]

    # Rearrange for Word 0 = [alpha0, alpha1, z_n_0, z_n_1]
    movup.3 movup.3
    # => [z_n_0, z_n_1, z_k_0, z_k_1, z_0, z_1, alpha0, alpha1, ...]
    movup.7 movup.7
    # => [alpha0, alpha1, z_n_0, z_n_1, z_k_0, z_k_1, z_0, z_1, ...]

    exec.constants::auxiliary_ace_inputs_ptr
    mem_storew_le
    drop drop
    # => [z_n_0, z_n_1, z_k_0, z_k_1, z_0, z_1, ...]

    # ==================================================================
    # Phase 2: is_first, is_last, is_transition -> Words 1-2
    # ==================================================================

    # Compute vanishing = z^N - 1 (z^N is still on the stack from Phase 1).
    sub.1
    # => [van_0, van_1, z_k_0, z_k_1, z_0, z_1, ...]

    # is_first = vanishing * inv(z - 1)
    dup.5 dup.5
    # => [z_0, z_1, van_0, van_1, z_k_0, z_k_1, z_0, z_1, ...]
    sub.1 ext2inv
    # => [inv_z1_0, inv_z1_1, van_0, van_1, z_k_0, z_k_1, z_0, z_1, ...]
    dup.3 dup.3
    # => [van_0, van_1, inv_z1_0, inv_z1_1, van_0, van_1, z_k_0, z_k_1, z_0, z_1, ...]
    ext2mul
    # => [is_first_0, is_first_1, van_0, van_1, z_k_0, z_k_1, z_0, z_1, ...]

    # Store Word 1: [z_k_0, z_k_1, is_first_0, is_first_1]
    movup.5 movup.5
    # => [z_k_0, z_k_1, is_first_0, is_first_1, van_0, van_1, z_0, z_1, ...]
    exec.constants::auxiliary_ace_inputs_ptr add.4
    mem_storew_le
    dropw
    # => [van_0, van_1, z_0, z_1, ...]

    # is_transition = z - g^{-1}  (g_inv is base-field, only coord 0 changes)
    exec.constants::get_trace_domain_generator
    inv
    # => [g_inv, van_0, van_1, z_0, z_1, ...]
    movup.3 swap sub
    # => [is_trans_0, van_0, van_1, z_1, ...]
    movup.3
    swap
    # => [is_trans_0, is_trans_1, van_0, van_1, ...]

    # is_last = vanishing * inv(is_transition)
    dup.1 dup.1
    # => [is_trans_0, is_trans_1, is_trans_0, is_trans_1, van_0, van_1, ...]
    ext2inv
    # => [inv_t_0, inv_t_1, is_trans_0, is_trans_1, van_0, van_1, ...]
    movup.5 movup.5
    # => [van_0, van_1, inv_t_0, inv_t_1, is_trans_0, is_trans_1, ...]
    ext2mul
    # => [is_last_0, is_last_1, is_trans_0, is_trans_1, ...]

    # Store Word 2: [is_last_0, is_last_1, is_trans_0, is_trans_1]
    exec.constants::auxiliary_ace_inputs_ptr add.8
    mem_storew_le
    dropw
    # => [...]

    # ==================================================================
    # Phase 3: f, s0, weight0, gamma -> Words 3-4
    # ==================================================================

    # Compute s0 = offset^N, where the LDE coset offset is g^(2^(TWO_ADICITY - log_lde))
    # (g = 7, the field generator). Since log_lde = log(trace_len) + log_blowup, we have
    # s0 = g^(2^(TWO_ADICITY - log_blowup)) = 7^(2^(32 - 3)) = 7^(2^29), independent of the
    # trace height. Computed by repeated squaring.
    push.7
    repeat.29
        dup mul
    end
    push.0 swap
    # => [s0, 0, ...]

    # Compute f = h^N (chunk shift ratio between cosets)
    push.0
    exec.constants::get_lde_domain_generator
    exec.constants::get_trace_length
    exp.u32
    # => [f, 0, s0, 0, ...]

    # Compute weight0 = 1 / (8 * s0^7)
    dup.2 dup
    mul
    # => [s0^2, f, 0, s0, 0, ...]
    dup dup mul
    # => [s0^4, s0^2, f, 0, s0, 0, ...]
    mul
    # => [s0^6, f, 0, s0, 0, ...]
    dup.3 mul
    mul.8 inv
    # => [weight0, f, 0, s0, 0, ...]

    # Store Word 4: [f, 0, s0, 0]
    movdn.4
    # => [f, 0, s0, 0, weight0, ...]
    exec.constants::auxiliary_ace_inputs_ptr add.16
    mem_storew_le
    dropw
    # => [weight0, ...]

    # Store Word 3: [gamma_0, gamma_1, weight0, 0]
    # weight0 is still on the stack -- sample gamma and assemble the word.
    exec.sample_gamma
    # => [gamma_0, gamma_1, weight0, ...]
    push.0 movdn.3
    # => [gamma_0, gamma_1, weight0, 0, ...]
    exec.constants::auxiliary_ace_inputs_ptr add.12
    mem_storew_le
    dropw

    # Phase 4: per-AIR lifted selectors at `z^{r_j}` (r_j = 2^(log_max - log_j)).
    # When AIR is at log_max (r=0) the values equal the canonical selectors.
    # Layout (offsets from AUXILIARY_ACE_INPUTS_PTR):
    #   +24/+26/+28: is_first_core, is_last_core, is_transition_core
    #   +30/+32/+34: is_first_chip, is_last_chip, is_transition_chip
    exec.constants::get_core_trace_length_log
    push.24
    exec.compute_per_air_selectors

    exec.constants::get_chiplets_trace_length_log
    push.30
    exec.compute_per_air_selectors
end

#! Load the quad field element stored at `[addr, addr+1]` onto the stack.
#!
#! Input:  [addr, ...]
#! Output: [q0, q1, ...]  (q0 = mem[addr], q1 = mem[addr+1])
proc load_quad
    dup add.1 mem_load
    swap mem_load
end

#! Compute is_first/is_last/is_transition for AIR at `log_air` at `out_offset` in the
#! stark-vars region. `r_log = log_max - log_air` stashed in tmp4 across the two
#! repeated-squaring passes (z_lift, g_air_inv).
#!
#! Input:  [out_offset, log_air, ...]
#! Output: [...]
proc compute_per_air_selectors
    # r_log = log_max - log_air
    swap
    exec.constants::get_trace_length_log
    swap sub
    # => [r_log, out_offset, ...]
    exec.constants::tmp4 mem_store
    # => [out_offset, ...]

    # Compute z_lift = z^(2^r_log).
    exec.constants::z_ptr add.2 exec.load_quad
    # => [z_0, z_1, out_offset, ...]
    exec.constants::tmp4 mem_load
    # => [r_log, z_0, z_1, out_offset, ...]
    dup push.0 neq
    while.true
        movdn.2
        dup.1 dup.1 ext2mul
        movup.2 sub.1
        dup push.0 neq
    end
    drop
    # => [z_lift_0, z_lift_1, out_offset, ...]

    # vanishing = z^N - 1 (loaded from slot 1 = offset +2/+3).
    exec.constants::auxiliary_ace_inputs_ptr add.2 exec.load_quad
    sub.1
    # => [van_0, van_1, z_lift_0, z_lift_1, out_offset, ...]

    # is_first_air = van / (z_lift - 1), stored at out_offset + 0/+1.
    dup.3 dup.3
    sub.1 ext2inv
    dup.3 dup.3 ext2mul
    # => [if_0, if_1, van_0, van_1, z_lift_0, z_lift_1, out_offset, ...]
    dup.6 exec.constants::auxiliary_ace_inputs_ptr add mem_store
    dup.5 exec.constants::auxiliary_ace_inputs_ptr add.1 add mem_store
    # => [van_0, van_1, z_lift_0, z_lift_1, out_offset, ...]

    # Compute g_air_inv = (g_max_inv)^(2^r_log).
    exec.constants::get_trace_domain_generator
    inv
    exec.constants::tmp4 mem_load
    # => [r_log, g_max_inv, van_0, van_1, z_lift_0, z_lift_1, out_offset, ...]
    dup push.0 neq
    while.true
        swap dup mul
        swap sub.1
        dup push.0 neq
    end
    drop
    # => [g_air_inv, van_0, van_1, z_lift_0, z_lift_1, out_offset, ...]

    # is_transition_air = z_lift - g_air_inv (g_air_inv is base; only coord 0 changes).
    movup.3 swap sub
    movup.3
    swap
    # => [it_0, it_1, van_0, van_1, out_offset, ...]

    # is_last_air = van / is_transition_air.
    dup.1 dup.1
    ext2inv
    movup.5 movup.5
    ext2mul
    # => [il_0, il_1, it_0, it_1, out_offset, ...]

    # Store is_last_air at out_offset + 2/+3.
    dup.4 exec.constants::auxiliary_ace_inputs_ptr add.2 add mem_store
    dup.3 exec.constants::auxiliary_ace_inputs_ptr add.3 add mem_store
    # => [it_0, it_1, out_offset, ...]

    # Store is_transition_air at out_offset + 4/+5.
    dup.2 exec.constants::auxiliary_ace_inputs_ptr add.4 add mem_store
    dup.1 exec.constants::auxiliary_ace_inputs_ptr add.5 add mem_store
    # => [out_offset, ...]
    drop
end

#! Sample the batching challenge gamma (EF element).
#!
#! gamma batches the constraint evaluation with the auxiliary trace boundary checks:
#!   output = constraint_check + gamma * product_check + gamma^2 * sum_check
#!
#! WORKAROUND (unsound): Reads gamma directly from the sponge rate buffer without
#! consuming it. This gives gamma == deep_alpha (the next challenge that will be
#! properly sampled in generate_deep_composition_random_coefficients). Using the
#! same value for both is unsound.
#!
#! TODO: Replace with a proper `sample_ext` call once the Rust-side transcript
#! draws gamma between the OOD and DEEP phases.
#!
#! Input:  [...]
#! Output: [gamma_0, gamma_1, ...]
proc sample_gamma
    # Assert output_len == 8 so the rate buffer indices are correct.
    exec.constants::random_coin_output_len_ptr mem_load push.8 assert_eq
    # Read the top 2 elements of the sponge rate buffer.
    exec.constants::r1_ptr add.7 mem_load
    exec.constants::r1_ptr add.6 mem_load
    swap
end

#! Executes the constraints evaluation check.
#!
#! Inputs:  [...]
#! Outputs: [...]
#!
#! Invocation: exec
pub proc execute_constraint_evaluation_check
    exec.constants::get_procedure_digest_execute_constraint_evaluation_check_ptr dynexec
end

# OOD MEMORY SHUFFLE
# =================================================================================================
#
# When `core_height > chiplets_height`, proof_order is `[Chiplets, Core]` but the codegen's slot
# indexing is fixed in caller_order. The procs below conditionally swap per-AIR halves of the OOD
# frame and aux_bus_boundary so eval_circuit reads the right values.

#! Copy `n_words` consecutive words from `src` to `dst`. Forward iteration; safe for left-shifts.
#! Uses TMP3/TMP4 as iterator state.
#!
#! Input:  [n_words, dst, src, ...]
#! Output: [...]
proc copy_words_fwd
    swap exec.constants::tmp4 mem_store
    swap exec.constants::tmp3 mem_store
    # => [n_words, ...]

    dup push.0 neq
    while.true
        sub.1

        padw exec.constants::tmp3 mem_load mem_loadw_le
        exec.constants::tmp4 mem_load mem_storew_le
        dropw

        exec.constants::tmp3 mem_load add.4
        exec.constants::tmp3 mem_store
        exec.constants::tmp4 mem_load add.4
        exec.constants::tmp4 mem_store

        dup push.0 neq
    end
    drop
end

#! Swap `mem[base..base+16]` ↔ `mem[base+16..base+32]` via SWAP_SCRATCH.
#! Input: [base, ...]
proc swap_two_4_word_blocks
    dup
    exec.constants::swap_scratch_ptr push.4
    exec.copy_words_fwd

    dup add.16 dup.1 push.4
    exec.copy_words_fwd

    add.16 exec.constants::swap_scratch_ptr swap push.4
    exec.copy_words_fwd
end

#! Swap [chip_main(48 felts), core_main(112 felts)] → [core_main, chip_main] at `base`.
#! Input: [base, ...]
proc swap_chip_core_main_blocks
    dup
    exec.constants::swap_scratch_ptr push.12
    exec.copy_words_fwd

    dup add.48 dup.1 push.28
    exec.copy_words_fwd

    add.112 exec.constants::swap_scratch_ptr swap push.12
    exec.copy_words_fwd
end

#! Swap the two EF halves of the aux_bus_boundary word: [c0, c1, k0, k1] → [k0, k1, c0, c1].
proc swap_aux_bus_boundary_word
    padw exec.constants::aux_bus_boundary_ptr mem_loadw_le
    movup.2 movup.3 swap
    exec.constants::aux_bus_boundary_ptr mem_storew_le
    dropw
end

#! Shuffle the OOD frame and aux_bus_boundary from proof_order to caller_order if
#! `log_chip < log_core` (i.e. proof_order = [Chiplets, Core]). No-op otherwise.
#! MUST run after OOD streaming + aux observation, before eval_circuit.
pub proc swap_air_regions_if_chip_first
    exec.constants::get_chiplets_trace_length_log
    exec.constants::get_core_trace_length_log
    u32lt
    # 1 iff log_chip < log_core (proof_order = [Chiplets, Core])
    if.true
        # main_curr at OOD_EVALUATIONS_PTR + 0
        exec.constants::ood_evaluations_ptr
        exec.swap_chip_core_main_blocks

        # aux_curr at OOD_EVALUATIONS_PTR + 160 (16-felt symmetric swap)
        exec.constants::ood_evaluations_ptr add.160
        exec.swap_two_4_word_blocks

        # main_next at OOD_EVALUATIONS_PTR + 224
        exec.constants::ood_evaluations_ptr add.224
        exec.swap_chip_core_main_blocks

        # aux_next at OOD_EVALUATIONS_PTR + 384
        exec.constants::ood_evaluations_ptr add.384
        exec.swap_two_4_word_blocks

        # aux_bus_boundary
        exec.swap_aux_bus_boundary_word
    end
end

#! Observes the auxiliary trace: draws aux randomness, reseeds with the aux trace commitment,
#! and absorbs aux trace boundary values into the transcript.
#!
#! For AIRs without an auxiliary trace, the implementation should be a no-op.
#!
#! Inputs:  [...]
#! Outputs: [...]
#!
#! Invocation: exec
pub proc observe_aux_trace
    exec.constants::get_procedure_digest_observe_aux_trace_ptr dynexec
end

#! Stores digests of dynamically executed procedures.
#!
#! Input: [D0, D1, D2, D3, D4, ...]
#! Output: [...]
pub proc store_dynamically_executed_procedures
    exec.constants::get_procedure_digest_observe_aux_trace_ptr mem_storew_le dropw
    exec.constants::get_procedure_digest_process_public_inputs_ptr mem_storew_le dropw
    exec.constants::get_procedure_digest_process_row_ood_evaluations_ptr mem_storew_le dropw
    exec.constants::get_procedure_digest_execute_constraint_evaluation_check_ptr mem_storew_le dropw
    exec.constants::get_procedure_digest_compute_deep_composition_polynomial_queries_ptr mem_storew_le dropw
    # => [...]
end