miden-stdlib 0.19.1

use.std::crypto::stark::constants
use.std::crypto::stark::random_coin
use.std::crypto::stark::public_inputs

use.std::crypto::hashes::rpo

# CONSTANTS
# =================================================================================================

# Number of fixed length public inputs with padding (in field elements)
# This is composed of the input/output operand stacks (16 * 2) and the program digest (4) and four
# zeros for padding to the next multiple of 4. Note that, then, the fixed length public inputs
# which are stored as extension field elements will be double-word aligned. 
const.NUM_FIXED_LEN_PUBLIC_INPUTS=40

# Number of variable length public input groups
const.NUM_VAR_LEN_PI_GROUPS=1

# Op label for kernel procedures table messages
const.KERNEL_OP_LABEL=48

# CONSTANTS GETTERS
# =================================================================================================

export.get_num_fixed_len_public_inputs
    push.NUM_FIXED_LEN_PUBLIC_INPUTS
end

# MAIN PROCEDURE
# =================================================================================================

#! Processes the public inputs.
#! 
#! This involves:
#!
#! 1. Loading from the advice stack the fixed-length public inputs and storing them in memory
#!    starting from the address pointed to by `public_inputs_address_ptr`.
#! 2. Loading from the advice stack the variable-length public inputs, storing them temporarily
#!    in memory, and then reducing them to an element in the challenge field using the auxiliary
#!    randomness. This reduced value is then used to impose a boundary condition on the relevant
#!    auxiliary column. 
#!
#! Note that the fixed length public inputs are stored as extension field elements while
#! the variable length ones are stored as base field elements.
#!
#! Note also that, while loading the above, we compute the hash of the public inputs. The hashing
#! starts with capacity registers of the hash function set to `C` that is the result of hashing
#! the proof context.
#!
#! The output D, that is the digest of the above hashing, is then used in order to reseed
#! the random coin.
#!
#! It is worth noting that:
#!
#! 1. Only the fixed-length public inputs are stored for the lifetime of the verification procedure.
#!    The variable-length public inputs are stored temporarily, as this simplifies the task of
#!    reducing them using the auxiliary randomness. On the other hand, the resulting values from
#!    the aforementioned reductions are stored right after the fixed-length public inputs. These
#!    are stored in a word-aligned manner and padded with zeros if needed.
#! 2. The public inputs address is computed in such a way so as we end up with the following
#!    memory layout:
#!
#!    [..., a_0...a_{m-1}, b_0...b_{n-1}, alpha0, alpha1, beta0, beta1, OOD-evaluations-start, ...]
#!
#!    where:
#!
#!    1. [a_0...a_{m-1}] are the fixed-length public inputs stored as extension field elements. This
#!       section is double-word-aligned.
#!    2. [b_0...b_{n-1}] are the results of reducing the variable length public inputs using
#!       auxiliary randomness. This section is word-aligned.
#!    3. [alpha0, alpha1, beta0, beta1] is the auxiliary randomness.
#!    4. `OOD-evaluations-start` is the first field element of the section containing the OOD
#!       evaluations.
#! 3. Note that for each bus message in a group in the variable length public inputs, each
#!    message is expected to be padded to the next multiple of 8 and provided in reverse order.
#!    This has the benefit of making the reduction using the auxiliary randomness more efficient
#!    using `horner_eval_base`.
#!
#!
#! Input: [C, ...]
#! Output: [...]
export.process_public_inputs
    # 1) Compute the address where the public inputs will be stored and store it.
    #    This also computes the address where the reduced variable-length public inputs will be stored.
    exec.get_num_fixed_len_public_inputs push.NUM_VAR_LEN_PI_GROUPS
    exec.public_inputs::compute_and_store_public_inputs_address
    # => [C, ...]

    # 2) Load the public inputs from the advice tape.
    #    This will also hash them so that we can absorb them into the transcript.
    exec.load_public_inputs
    # => [D, ...]

    # 3) Absorb into the transcript
    exec.random_coin::reseed
    # => [...]

    # 4) Reduce the variable-length public inputs using randomness.
    exec.reduce_variable_length_public_inputs
    # => [...]
end

# HELPER PROCEDURES
# =================================================================================================

#! Loads from the advice stack the public inputs and stores them in memory starting from address
#! pointed to by `public_inputs_address_ptr`.
#!
#! Note that the public inputs are stored as extension field elements.
#!
#! In parallel, it computes the hash of the public inputs being loaded. The hashing starts with
#! capacity registers of the hash function set to `C` resulting from hashing the proof context.
#! The output D is the digest of the hashing of the public inputs.
#!
#! Inputs:  [C, ...]
#! Outputs: [D, ...]
proc.load_public_inputs
    # 1) Load and hash the fixed length public inputs
    
    exec.constants::public_inputs_address_ptr mem_load
    movdn.4
    padw padw
    repeat.5
        exec.public_inputs::load_base_store_extension_double_word
        hperm
    end

    # 2) Load and hash the variable length public inputs

    ## a) Compute the number of base field elements in total in the variable length public inputs
    exec.constants::num_public_inputs_ptr mem_load
    exec.get_num_fixed_len_public_inputs
    sub
    # => [num_var_len_pi, R2, R1, C, ptr, ...]

    ## b) Compute the number of hash iteration needed to hash the variable length public inputs.
    ##    We also check the double-word alignment.
    u32divmod.8
    # => [rem, num_iter, R2, R1, C, ptr, ...]
    push.0 assert_eq
    # => [num_iter, R2, R1, C, ptr, ...]
    
    ## c) Prepare the stack for hashing
    movdn.13
    # => [R2, R1, C, ptr, num_iter, ...]
    dup.13 sub.1 swap.14
    push.0 neq
    # => [(num_iter == 0), R2, R1, C, ptr, num_iter - 1, ...]

    ## d) Hash the variable length public inputs
    while.true
        adv_pipe
        hperm
        # => [R2, R1, C, ptr, num_iter, ...]
        dup.13 sub.1 swap.14
        push.0 neq
    end
    # => [R2, R1, C, ptr, num_iter, ...]

    # 3) Return the final digest
    exec.rpo::squeeze_digest
    # => [D, ptr, num_iter, ...] where D = R1 the digest
    movup.4 drop
    movup.4 drop
    # => [D, ...]
end

#! Reduces the variable-length public inputs using the auxiliary randomness.
#!
#! The procedure non-deterministically loads the auxiliary randomness from the advice tape and
#! stores it at `aux_rand_nd_ptr` so that it can be later checked for correctness. After this,
#! the procedure uses the auxiliary randomness in order to reduce the variable-length public
#! inputs to a single element in the challenge field. The resulting values are then stored
#! contiguously after the fixed-length public inputs.
#!
#! Currently, the only variable-length public inputs are the kernel procedure digests.
#!
#! Input: 
#!      - Operand stack: [...]
#!      - Advice stack: [beta0, beta1, alpha0, alpha1, var_len_pi_1_len, ..., var_len_pi_k_len, ...]
#! Output: [...]
proc.reduce_variable_length_public_inputs
    # 1) Load the auxiliary randomness i.e., alpha and beta
    #    We store them as [beta0, beta1, alpha0, alpha1] since `horner_eval_ext` requires memory
    #    word-alignment.
    adv_push.4
    exec.constants::aux_rand_nd_ptr mem_storew_be
    # => [alpha1, alpha0, beta1, beta0, ...]
    dropw
    # => [...]

    # 2) Get the pointer to the variable-length public inputs.
    #    This is also the pointer to the first address at which we will store the results of
    #    the reductions.
    exec.constants::variable_length_public_inputs_address_ptr mem_load
    dup
    # => [next_var_len_pub_inputs_ptr, var_len_pub_inputs_res_ptr, ...] where
    # `next_var_len_pub_inputs_ptr` points to the next chunk of variable public inputs to be reduced,
    # and `var_len_pub_inputs_res_ptr` points to the next available memory location where the result
    # of the reduction can be stored.
    # Note that, as mentioned in the top of this module, the variable-length public inputs are only
    # stored temporarily and they will be over-written by, among other data, the result of reducing
    # the variable public inputs.
    
    adv_push.1 exec.reduce_kernel_digests
    # => [next_var_len_pub_inputs_ptr, var_len_pub_inputs_res_ptr, ...]

    # 3) Clean up the stack.
    drop drop
    # => [...]
end

#! Reduces the kernel procedures digests using auxiliary randomness.
#!
#! Inputs:  [num_ker_procedures, digests_ptr]
#! Outputs: [next_ptr]
#!
#! where `digests_ptr` is a pointer to the kernel procedures digests and `next_ptr` is a pointer to
#! the start of the next group of variable public inputs to be reduced, if such a group exists.
proc.reduce_kernel_digests
    # Assert that the number of kernel procedures is at most 1023
    dup u32lt.1024 assert

    # Store number of kernel procedures digests
    push.0.0 dup.2
    exec.constants::tmp1 mem_storew_be
    # => [num_ker_procedures, 0, 0, num_ker_procedures, digests_ptr, ...]

    # Load alpha
    exec.constants::aux_rand_nd_ptr mem_loadw_be
    # => [alpha1, alpha0, beta1, beta0, digests_ptr, ...]

    # We will keep [beta0, beta1, alpha0 + op_label, alpha1] on the stack so that we can compute
    # the final result, where op_label is a unique label to domain separate the interaction with
    # the chiplets` bus.
    # The final result is then computed as:
    #
    #   alpha + op_label * beta^0 + beta * (r_0 * beta^0 + r_1 * beta^1 + r_2 * beta^2 + r_3 * beta^3)
    swap
    push.KERNEL_OP_LABEL
    add
    swap
    # => [alpha1, alpha0 + op_label, beta1, beta0, digests_ptr, ...]

    # Push the `horner_eval_ext` accumulator
    push.0.0
    # => [acc1, acc0, alpha1, alpha0 + op_label, beta1, beta0, digests_ptr, ...]

    # Push the pointer to the evaluation point beta
    exec.constants::aux_rand_nd_ptr
    # => [beta_ptr, acc1, acc0, alpha1, alpha0 + op_label, beta1, beta0,  digests_ptr, ...]

    # Get the pointer to kernel procedures digests
    movup.7
    # => [digests_ptr, beta_ptr, acc1, acc0, alpha1, alpha0 + op_label, beta1, beta0,  ...]

    # Set up the stack for `mem_stream` + `horner_eval_ext`
    swapw
    padw padw
    # => [Y, Y, alpha1, alpha0 + op_label, beta1, beta0,  digests_ptr, beta_ptr, acc1, acc0, ...]
    # where `Y` is a garbage word.

    # Build a boolean flag for the expression `num_ker_procedures > 0`.
    # This will be used to drive the loop reducing the digests
    exec.constants::tmp1 mem_loadw_be dup
    push.0
    neq

    while.true
        # Compute `acc = ∑ {0≤i<4} digest_ptr[i] ⋅ βⁱ`
        #
        # Since the width of the digests, plus the op_label, is less than 8, we only need
        # to loop once
        repeat.1
            mem_stream
            horner_eval_base
        end
        # => [Y, Y, alpha1, alpha0 + op_label, beta1, beta0, digests_ptr, beta_ptr, acc1, acc0, ...]

        swapdw
        # => [alpha1, alpha0 + op_label, beta1, beta0, digests_ptr, beta_ptr, acc1, acc0, Y, Y, ...]

        movup.7 movup.7
        # => [acc1, acc0, alpha1, alpha0 + op_label, beta1, beta0, digests_ptr, beta_ptr, Y, Y, ...]
        
        # Compute `tmp = β ⋅ acc =  ∑ {0≤i<4} digest_ptr[i] ⋅ βⁱ⁺¹`
        dup.5 dup.5
        # => [beta1, beta0, acc1, acc0, alpha1, alpha0 + op_label, beta1, beta0, digests_ptr, beta_ptr, Y, Y, ...]
        ext2mul
        # => [tmp1, tmp0, alpha1, alpha0 + op_label, beta1, beta0, digests_ptr, beta_ptr, Y, Y, ...]

        # Compute `term = α + op_label + ∑{i = 0..4} digest_ptr[i] ⋅ βⁱ⁺¹`
        dup.3 dup.3
        ext2add
        # => [term1, term0, alpha1, alpha0 + op_label, beta1, beta0, digests_ptr, beta_ptr, Y, Y, ...]
  
        movdn.15
        movdn.15
        # => [alpha1, alpha0 + op_label, beta1, beta0, digests_ptr, beta_ptr, Y, Y, term1, term0, ...]

        push.0 movdn.6
        push.0 movdn.6
        # => [alpha1, alpha0 + op_label, beta1, beta0, digests_ptr, beta_ptr, 0, 0, Y, Y, term1, term0, ...]
 
        swapdw
        # => [Y, Y, alpha1, alpha0 + op_label, beta1, beta0, digests_ptr, beta_ptr, 0, 0, term1, term0, ...]

        # Subtract 1 from num_ker_digests
        exec.constants::tmp1 mem_loadw_be sub.1
        exec.constants::tmp1 mem_storew_be

        # Compute flag to decide on exiting the loop
        dup
        push.0
        neq
    end
    # => [Y, Y, alpha1, alpha0 + op_label, beta1, beta0, digests_ptr, beta_ptr, 0, 0, term1, term0, ...]

    dropw dropw dropw
    # => [digests_ptr, beta_ptr, 0, 0, term1, term0, ...]
    dup exec.constants::tmp2 mem_store
    exec.constants::tmp1 mem_loadw_be drop drop drop

    # We now need to multiply all of the reduced values
    # We push the multiplicative identity element, the number of elements to multiply, and a boolean
    # flag to enter or not the loop
    push.1.0
    movup.2
    dup
    push.0
    neq
    # => [loop, n, acc1, acc0, term1_1, term1_0, ..., termn_1, termn_0, ...]

    while.true
        sub.1 movdn.4
        # => [acc1, acc0, term1_1, term1_0, n - 1, ..., termn_1, termn_0, ...]
        ext2mul
        # => [acc1', acc0', n - 1, ..., termn_1, termn_0, ...]
        movup.2
        dup
        push.0
        neq
        # => [loop, n - 1, acc1', acc0', term1_1, term1_0, ..., termn_1, termn_0, ...]
    end
    # => [0, prod1, prod0, ...] where prod is the resulting product

    # since we are initializing the bus with "requests", we should invert the reduced result
    drop
    ext2inv
    # => [inv_prod1, inv_prod0, ...]

    # Store the result at `digests_ptr` as `[inv_prod0, inv_prod1, 0, 0]`
    # Note that `digests_ptr` points to the group that we just reduced above and hence it is safe
    # to overwrite it with the result.
    exec.constants::tmp2 mem_load movdn.2
    # => [inv_prod1, inv_prod0, digests_ptr, ...]
    push.0.0
    # => [0, 0, inv_prod1, inv_prod0, digests_ptr, var_len_pub_inputs_res_ptr, ...]
    dup.5 add.4 swap.6
    mem_storew_be
    dropw
    # => [digests_ptr, var_len_pub_inputs_res_ptr, ...]
end