miden-core-lib 0.24.2

# ===== HELPER FUNCTIONS ==========================================================================

#! Asserts that both values at the top of the stack are u64 values.
#! The input values are assumed to be represented using 32 bit limbs, fails if they are not.
#! This takes 6 cycles.
proc u32assert4
    u32assert2
    movup.3
    movup.3
    u32assert2
    movup.3
    movup.3
end

# ===== ADDITION ==================================================================================

#! Performs addition of two unsigned 64 bit integers discarding the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = (a + b) % 2^64
#! This takes 6 cycles.
pub proc wrapping_add(b: u64, a: u64) -> u64
    exec.overflowing_add
    drop
end

#! Performs addition of two unsigned 64 bit integers preserving the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [overflow, c_lo, c_hi, ...], where c = (a + b) % 2^64
#! This takes 5 cycles.
pub proc overflowing_add(b: u64, a: u64) -> (i1, u64)
    movup.2             # => [a_lo, b_lo, b_hi, a_hi]
    u32widening_add     # => [sum_lo, carry, b_hi, a_hi]
    movdn.3             # => [carry, b_hi, a_hi, sum_lo]
    u32widening_add3    # => [sum_hi, overflow, sum_lo]
    movdn.2             # => [overflow, sum_lo, sum_hi]
end

#! Performs addition of two unsigned 64 bit integers preserving the overflow with sum on top.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, overflow, ...], where c = (a + b) % 2^64
#! This takes 6 cycles.
pub proc widening_add(b: u64, a: u64) -> (u64, i1)
    exec.overflowing_add
    movdn.2
end

# ===== SUBTRACTION ===============================================================================

#! Performs subtraction of two unsigned 64 bit integers discarding the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = (a - b) % 2^64
#! This takes 12 cycles.
pub proc wrapping_sub(b: u64, a: u64) -> u64
    movup.3 movup.3     # => [a_lo, a_hi, b_lo, b_hi]
    movup.2             # => [b_lo, a_lo, a_hi, b_hi]
    u32overflowing_sub  # => [borrow_lo, c_lo, a_hi, b_hi] (a_lo - b_lo)
    movup.2             # => [a_hi, borrow_lo, c_lo, b_hi]
    movup.3             # => [b_hi, a_hi, borrow_lo, c_lo]
    u32overflowing_sub  # => [borrow_hi, diff_hi, borrow_lo, c_lo] (a_hi - b_hi)
    drop                # => [diff_hi, borrow_lo, c_lo]
    swap                # => [borrow_lo, diff_hi, c_lo]
    u32overflowing_sub  # => [borrow2, c_hi, c_lo] (diff_hi - borrow_lo)
    drop                # => [c_hi, c_lo]
    swap                # => [c_lo, c_hi]
end

#! Performs subtraction of two unsigned 64 bit integers preserving the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [underflow, c_lo, c_hi, ...], where c = (a - b) % 2^64
#! This takes 14 cycles.
pub proc overflowing_sub(b: u64, a: u64) -> (i1, u64)
    movup.3 movup.3     # => [a_lo, a_hi, b_lo, b_hi]
    movup.2             # => [b_lo, a_lo, a_hi, b_hi]
    u32overflowing_sub  # => [borrow_lo, c_lo, a_hi, b_hi] (a_lo - b_lo)
    movup.2             # => [a_hi, borrow_lo, c_lo, b_hi]
    movup.3             # => [b_hi, a_hi, borrow_lo, c_lo]
    u32overflowing_sub  # => [borrow_hi, diff_hi, borrow_lo, c_lo] (a_hi - b_hi)
    swap                # => [diff_hi, borrow_hi, borrow_lo, c_lo]
    movup.2             # => [borrow_lo, diff_hi, borrow_hi, c_lo]
    u32overflowing_sub  # => [borrow2, c_hi, borrow_hi, c_lo] (diff_hi - borrow_lo)
    movup.2             # => [borrow_hi, borrow2, c_hi, c_lo]
    or                  # => [underflow, c_hi, c_lo]
    movup.2             # => [c_lo, underflow, c_hi]
    swap                # => [underflow, c_lo, c_hi]
end

# ===== MULTIPLICATION ============================================================================

#! Performs multiplication of two unsigned 64 bit integers discarding the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = (a * b) % 2^64
#! This takes 15 cycles.
pub proc wrapping_mul(b: u64, a: u64) -> u64
    dup.2               # => [a_lo, b_lo, b_hi, a_lo, a_hi]
    dup.1               # => [b_lo, a_lo, b_lo, b_hi, a_lo, a_hi]
    u32widening_mul  # => [p0_lo, p0_hi, b_lo, b_hi, a_lo, a_hi]
    swap                # => [p0_hi, p0_lo, b_lo, b_hi, a_lo, a_hi]
    movup.3             # => [b_hi, p0_hi, p0_lo, b_lo, a_lo, a_hi]
    movup.4             # => [a_lo, b_hi, p0_hi, p0_lo, b_lo, a_hi]
    u32widening_madd # => [t1_lo, t1_hi, p0_lo, b_lo, a_hi]
    swap                # => [t1_hi, t1_lo, p0_lo, b_lo, a_hi]
    drop                # => [t1_lo, p0_lo, b_lo, a_hi]
    movup.2             # => [b_lo, t1_lo, p0_lo, a_hi]
    movup.3             # => [a_hi, b_lo, t1_lo, p0_lo]
    u32widening_madd # => [t2_lo, t2_hi, p0_lo]
    swap                # => [t2_hi, t2_lo, p0_lo]
    drop                # => [t2_lo, p0_lo]
    swap                # => [p0_lo, t2_lo] = [c_lo, c_hi]
end

#! Performs multiplication of two unsigned 64 bit integers preserving the overflow flag.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [overflow, c_lo, c_hi, ...], where c = (a * b) % 2^64
#! and overflow = 1 iff a * b >= 2^64.
#! This takes 38 cycles.
pub proc overflowing_mul(b: u64, a: u64) -> (i1, u64)
    exec.widening_mul       # [c_lo, c_mid_lo, c_mid_hi, c_hi, ...]
    movup.3 movup.3         # [c_mid_hi, c_hi, c_lo, c_mid_lo, ...]
    u32or                   # [c_mid_hi | c_hi, c_lo, c_mid_lo, ...]
    neq.0                   # [overflow, c_lo, c_mid_lo, ...]
end

#! Performs multiplication of two unsigned 64 bit integers preserving the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_mid_lo, c_mid_hi, c_hi, ...], where
#! c = a * b is represented as a 128-bit value split into 4 32-bit limbs.
#! This takes 23 cycles.
pub proc widening_mul(b: u64, a: u64) -> u128
    # c = a*b = a_lo*b_lo + (a_lo*b_hi + a_hi*b_lo)*2^32 + a_hi*b_hi*2^64
    # Partial products accumulate high-first on the stack, then reversew flips to LE.

    # => [b_lo, b_hi, a_lo, a_hi, ...]
    dup.0
    dup.3
    # => [a_lo, b_lo, b_lo, b_hi, a_lo, a_hi, ...]
    u32widening_mul
    # => [p0_lo, p0_hi, b_lo, b_hi, a_lo, a_hi, ...] where p0 = a_lo * b_lo
    swap
    # => [p0_hi, p0_lo, b_lo, b_hi, a_lo, a_hi, ...]
    dup.3
    movup.5
    # => [a_lo, b_hi, p0_hi, p0_lo, b_lo, b_hi, a_hi, ...]
    u32widening_madd
    # => [t1_lo, t1_hi, p0_lo, b_lo, b_hi, a_hi, ...] where t1 = a_lo * b_hi + p0_hi
    movup.3
    dup.5
    # => [a_hi, b_lo, t1_lo, t1_hi, p0_lo, b_hi, a_hi, ...]
    u32widening_madd
    # => [t2_lo, t2_hi, t1_hi, p0_lo, b_hi, a_hi, ...] where t2 = a_hi * b_lo + t1_lo
    swap
    # => [t2_hi, t2_lo, t1_hi, p0_lo, b_hi, a_hi, ...]
    movup.4
    movup.5
    # => [a_hi, b_hi, t2_hi, t2_lo, t1_hi, p0_lo, ...]
    u32widening_madd
    # => [t3_lo, t3_hi, t2_lo, t1_hi, p0_lo, ...] where t3 = a_hi * b_hi + t2_hi
    swap
    # => [t3_hi, t3_lo, t2_lo, t1_hi, p0_lo, ...]
    movup.3
    movup.2
    # => [t3_lo, t1_hi, t3_hi, t2_lo, p0_lo, ...]
    u32widening_add
    # => [sum, carry, t3_hi, t2_lo, p0_lo, ...] where sum = (t3_lo + t1_hi) mod 2^32
    movdn.2
    # => [carry, t3_hi, sum, t2_lo, p0_lo, ...]
    add
    # => [c_hi, c_mid_hi, c_mid_lo, c_lo, ...]
    reversew
    # => [c_lo, c_mid_lo, c_mid_hi, c_hi, ...]
end

# ===== COMPARISONS ===============================================================================

#! Performs less-than comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c, ...], where c = 1 when a < b, and 0 otherwise.
#! This takes 13 cycles.
pub proc lt(b: u64, a: u64) -> i1
    movup.3 movup.3
    # => [a_lo, a_hi, b_lo, b_hi, ...]
    # Computes a < b, which is true iff:
    # - a_hi < b_hi, OR
    # - a_hi == b_hi AND a_lo < b_lo
    movup.2
    # => [b_lo, a_lo, a_hi, b_hi, ...]
    u32overflowing_sub
    # => [borrow_lo, diff_lo, a_hi, b_hi, ...] where diff_lo = a_lo - b_lo
    movdn.3
    # => [diff_lo, a_hi, b_hi, borrow_lo, ...]
    drop
    # => [a_hi, b_hi, borrow_lo, ...]
    swap
    # => [b_hi, a_hi, borrow_lo, ...]
    u32overflowing_sub
    # => [borrow_hi, diff_hi, borrow_lo, ...] where diff_hi = a_hi - b_hi
    swap
    # => [diff_hi, borrow_hi, borrow_lo, ...]
    eq.0
    # => [diff_hi==0, borrow_hi, borrow_lo, ...]
    movup.2
    # => [borrow_lo, diff_hi==0, borrow_hi, ...]
    and
    # => [(diff_hi==0 AND borrow_lo), borrow_hi, ...]
    or
    # => [result, ...] where result = borrow_hi OR (diff_hi==0 AND borrow_lo)
end

#! Performs greater-than comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c, ...], where c = 1 when a > b, and 0 otherwise.
#! This takes 13 cycles.
pub proc gt(b: u64, a: u64) -> i1
    movup.3 movup.3
    # => [a_lo, a_hi, b_lo, b_hi, ...]
    # Computes a > b, which is equivalent to b < a, true iff:
    # - b_hi < a_hi, OR
    # - b_hi == a_hi AND b_lo < a_lo
    movup.2
    # => [b_lo, a_lo, a_hi, b_hi, ...]
    swap
    # => [a_lo, b_lo, a_hi, b_hi, ...]
    u32overflowing_sub
    # => [borrow_lo, diff_lo, a_hi, b_hi, ...] where diff_lo = b_lo - a_lo
    movdn.3
    # => [diff_lo, a_hi, b_hi, borrow_lo, ...]
    drop
    # => [a_hi, b_hi, borrow_lo, ...]
    u32overflowing_sub
    # => [borrow_hi, diff_hi, borrow_lo, ...] where diff_hi = b_hi - a_hi
    swap
    # => [diff_hi, borrow_hi, borrow_lo, ...]
    eq.0
    # => [diff_hi==0, borrow_hi, borrow_lo, ...]
    movup.2
    # => [borrow_lo, diff_hi==0, borrow_hi, ...]
    and
    # => [(diff_hi==0 AND borrow_lo), borrow_hi, ...]
    or
    # => [result, ...] where result = borrow_hi OR (diff_hi==0 AND borrow_lo)
end

#! Performs less-than-or-equal comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c, ...], where c = 1 when a <= b, and 0 otherwise.
#! This takes 14 cycles.
pub proc lte(b: u64, a: u64) -> i1
    exec.gt
    not
end

#! Performs greater-than-or-equal comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c, ...], where c = 1 when a >= b, and 0 otherwise.
#! This takes 13 cycles.
pub proc gte(b: u64, a: u64) -> i1
    exec.lt
    not
end

#! Performs equality comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c, ...], where c = 1 when a == b, and 0 otherwise.
#! This takes 6 cycles.
pub proc eq(b: u64, a: u64) -> i1
    movup.2     # => [a_lo, b_lo, b_hi, a_hi]
    eq          # => [a_lo==b_lo, b_hi, a_hi]
    swap.2      # => [a_hi, b_hi, a_lo==b_lo]
    eq          # => [a_hi==b_hi, a_lo==b_lo]
    and         # => [result]
end

#! Performs inequality comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c, ...], where c = 1 when a != b, and 0 otherwise.
#! This takes 8 cycles.
pub proc neq(b: u64, a: u64) -> i1
    movup.2     # => [a_lo, b_lo, b_hi, a_hi]
    neq         # => [a_lo!=b_lo, b_hi, a_hi]
    swap.2      # => [a_hi, b_hi, a_lo!=b_lo]
    neq         # => [a_hi!=b_hi, a_lo!=b_lo]
    or          # => [result]
end

#! Performs comparison to zero of an unsigned 64 bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [a_lo, a_hi, ...] -> [c, ...], where c = 1 when a == 0, and 0 otherwise.
#! This takes 4 cycles.
pub proc eqz(a: u64) -> i1
    eq.0        # => [a_lo==0, a_hi]
    swap        # => [a_hi, a_lo==0]
    eq.0        # => [a_hi==0, a_lo==0]
    and         # => [result]
end

#! Compares two unsigned 64 bit integers and drops the larger one from the stack.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = min(a, b).
#! This takes 27 cycles.
pub proc min(b: u64, a: u64) -> u64
    movup.3 movup.3
    # => [a_lo, a_hi, b_lo, b_hi, ...]
    dupw
    exec.gt
    movup.4
    movup.3
    dup.2
    cdrop
    movdn.3
    cdrop
end

#! Compares two unsigned 64 bit integers and drops the smaller one from the stack.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = max(a, b).
#! This takes 27 cycles.
pub proc max(b: u64, a: u64) -> u64
    movup.3 movup.3
    # => [a_lo, a_hi, b_lo, b_hi, ...]
    dupw
    exec.lt
    movup.4
    movup.3
    dup.2
    cdrop
    movdn.3
    cdrop
end

# ===== DIVISION ==================================================================================

const U64_DIV_EVENT = event("miden::core::math::u64::u64_div")

#! Performs division of two unsigned 64 bit integers discarding the remainder.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = a // b
#! This takes 56 cycles.
pub proc div(b: u64, a: u64) -> u64
    exec.divmod
    drop drop
end

# ===== MODULO OPERATION ==========================================================================

#! Performs modulo operation of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = a % b
#! This takes 58 cycles.
pub proc mod(b: u64, a: u64) -> u64
    exec.divmod
    movup.2 drop
    movup.2 drop
end

# ===== DIVMOD OPERATION ==========================================================================

#! Performs divmod operation of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [r_lo, r_hi, q_lo, q_hi, ...], where q = a / b, r = a % b.
#! This takes 54 cycles.
pub proc divmod(b: u64, a: u64) -> (remainder: u64, quotient: u64)
    # Input stack: [b_lo, b_hi, a_lo, a_hi, ...]
    # We verify: a = q * b + r, where q is quotient, r is remainder
    # Output: [r_lo, r_hi, q_lo, q_hi, ...]

    emit.U64_DIV_EVENT
    # Advice stack now has [q_hi, q_lo, r_hi, r_lo]
    # => [b_lo, b_hi, a_lo, a_hi, ...]

    adv_push adv_push
    # Pops from advice: first q_hi, then q_lo, pushes them
    # => [q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
    u32assert2

    #==========================================================================
    # MULTIPLICATION BLOCK: Compute prod = q * b (must fit in 64 bits)
    #==========================================================================
    # Same as div procedure

    dup.2
    # => [b_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
    dup.1
    # => [q_lo, b_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
    u32widening_mul
    # outputs [lo, hi]: Stack: [p1_lo, p1_hi, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
    swap
    # => [p1_hi, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]

    dup.5
    # => [b_hi, p1_hi, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
    dup.3
    # => [q_lo, b_hi, p1_hi, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
    u32widening_madd
    # outputs [lo, hi]: Stack: [p2_lo, p2_hi, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
    swap
    # => [p2_hi, p2_lo, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
    eq.0
    assert.err="comparison failed: divmod"
    # => [p2_lo, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]

    dup.4
    # => [b_lo, p2_lo, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
    dup.4
    # => [q_hi, b_lo, p2_lo, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
    u32widening_madd
    # outputs [lo, hi]: Stack: [p3_lo, p3_hi, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
    swap
    # => [p3_hi, p3_lo, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
    eq.0
    assert.err="comparison failed: divmod"
    # => [p3_lo, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
    # prod_hi = p3_lo, prod_lo = p1_lo

    dup.5
    # => [b_hi, p3_lo, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
    dup.4
    # => [q_hi, b_hi, p3_lo, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
    mul
    eq.0
    assert.err="comparison failed: divmod"
    # => [prod_hi, prod_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]

    #==========================================================================
    # REMAINDER BLOCK: Get remainder and verify b > r
    #==========================================================================
    adv_push adv_push
    # Pops from advice: first r_hi, then r_lo
    # => [r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
    u32assert2

    movup.6
    # => [b_lo, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, b_hi, a_lo, a_hi, ...]
    movup.7
    # => [b_hi, b_lo, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
    swap
    # => [b_lo, b_hi, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
    dup.3
    # => [r_hi, b_lo, b_hi, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
    dup.3
    # => [r_lo, r_hi, b_lo, b_hi, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
    movup.3
    # => [b_hi, r_lo, r_hi, b_lo, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
    movup.3
    # => [b_lo, b_hi, r_lo, r_hi, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
    exec.lt
    # Computes r < b, consumes both pairs
    # => [1, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
    assert.err="comparison failed: divmod"
    # => [r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]

    #==========================================================================
    # VERIFICATION BLOCK: Check prod + r = a
    #==========================================================================
    # We need: sum_lo = (prod_lo + r_lo) mod 2^32, sum_hi = prod_hi + r_hi + carry_lo
    # And verify: sum_lo == a_lo, sum_hi == a_hi
    # We also need to keep r_lo, r_hi on the stack for output

    # Current stack: [r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]

    # Duplicate r_lo for the add (we'll need it in output)
    dup.0
    # => [r_lo, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
    movup.4
    # => [prod_lo, r_lo, r_lo, r_hi, prod_hi, q_lo, q_hi, a_lo, a_hi, ...]
    u32widening_add
    # outputs [sum, carry]: computes prod_lo + r_lo
    # => [sum_lo, carry_lo, r_lo, r_hi, prod_hi, q_lo, q_hi, a_lo, a_hi, ...]

    # Now we need to compute: sum_hi = prod_hi + r_hi + carry_lo
    # Duplicate r_hi for the add (we'll need it in output)
    swap
    # => [carry_lo, sum_lo, r_lo, r_hi, prod_hi, q_lo, q_hi, a_lo, a_hi, ...]
    dup.3
    # => [r_hi, carry_lo, sum_lo, r_lo, r_hi, prod_hi, q_lo, q_hi, a_lo, a_hi, ...]
    movup.5
    # => [prod_hi, r_hi, carry_lo, sum_lo, r_lo, r_hi, q_lo, q_hi, a_lo, a_hi, ...]
    movup.2
    # => [carry_lo, prod_hi, r_hi, sum_lo, r_lo, r_hi, q_lo, q_hi, a_lo, a_hi, ...]
    u32widening_add3
    # outputs [sum, carry]: computes carry_lo + prod_hi + r_hi
    # => [sum_hi, carry_hi, sum_lo, r_lo, r_hi, q_lo, q_hi, a_lo, a_hi, ...]
    swap
    # => [carry_hi, sum_hi, sum_lo, r_lo, r_hi, q_lo, q_hi, a_lo, a_hi, ...]
    eq.0
    assert.err="comparison failed: divmod"
    # => [sum_hi, sum_lo, r_lo, r_hi, q_lo, q_hi, a_lo, a_hi, ...]

    # Verify sum_hi == a_hi
    movup.7
    # => [a_hi, sum_hi, sum_lo, r_lo, r_hi, q_lo, q_hi, a_lo, ...]
    assert_eq.err="comparison failed: divmod"
    # => [sum_lo, r_lo, r_hi, q_lo, q_hi, a_lo, ...]

    # Verify sum_lo == a_lo
    movup.5
    # => [a_lo, sum_lo, r_lo, r_hi, q_lo, q_hi, ...]
    assert_eq.err="comparison failed: divmod"
    # => [r_lo, r_hi, q_lo, q_hi, ...]

end

# ===== BITWISE OPERATIONS ========================================================================

#! Performs bitwise AND of two unsigned 64-bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = a AND b.
#! This takes 5 cycles.
pub proc and(b: u64, a: u64) -> u64
    movup.2     # => [a_lo, b_lo, b_hi, a_hi]
    u32and      # => [c_lo, b_hi, a_hi]
    swap.2      # => [a_hi, b_hi, c_lo]
    u32and      # => [c_hi, c_lo]
    swap        # => [c_lo, c_hi]
end

#! Performs bitwise OR of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = a OR b.
#! This takes 5 cycles.
pub proc or(b: u64, a: u64) -> u64
    movup.2     # => [a_lo, b_lo, b_hi, a_hi]
    u32or       # => [c_lo, b_hi, a_hi]
    swap.2      # => [a_hi, b_hi, c_lo]
    u32or       # => [c_hi, c_lo]
    swap        # => [c_lo, c_hi]
end

#! Performs bitwise XOR of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = a XOR b.
#! This takes 5 cycles.
pub proc xor(b: u64, a: u64) -> u64
    movup.2     # => [a_lo, b_lo, b_hi, a_hi]
    u32xor      # => [c_lo, b_hi, a_hi]
    swap.2      # => [a_hi, b_hi, c_lo]
    u32xor      # => [c_hi, c_lo]
    swap        # => [c_lo, c_hi]
end

#! Performs bitwise NOT of one unsigned 64 bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = !a (i.e., 2^64 - 1 - a).
#! This takes 4 cycles.
pub proc not(a: u64) -> u64
    u32not          # [!a_lo, a_hi, ...]
    swap            # [a_hi, !a_lo, ...]
    u32not          # [!a_hi, !a_lo, ...]
    swap            # [!a_lo, !a_hi, ...]
end

#! Performs left shift of one unsigned 64-bit integer.
#! The input value to be shifted is assumed to be represented using 32 bit limbs, but this is not checked.
#! The shift value n should be in the range [0, 64), otherwise it will result in an error.
#! Stack transition looks as follows:
#! [n, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = (a << n) mod 2^64.
#! This takes 21 cycles.
pub proc shl(n: u32, a: u64) -> u64
    pow2            # [2^n, a_lo, a_hi]
    u32split        # [pow_lo, pow_hi, a_lo, a_hi]
    movup.2         # [a_lo, pow_lo, pow_hi, a_hi]
    movup.3         # [a_hi, a_lo, pow_lo, pow_hi]
    swap            # [a_lo, a_hi, pow_lo, pow_hi]
    exec.wrapping_mul
end

#! Performs right shift of one unsigned 64-bit integer.
#! The input value to be shifted is assumed to be represented using 32 bit limbs, but this is not checked.
#! The shift value n should be in the range [0, 64), otherwise it will result in an error.
#! Stack transition looks as follows:
#! [n, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = a >> n.
#! This takes 60 or 61 cycles, depending on n.
pub proc shr(n: u32, a: u64) -> u64
    # ==========================================================================
    # RIGHT SHIFT: Computes a >> n where a is 64-bit and n is shift amount
    # Input:  [n, a_lo, a_hi, ...] where a = a_hi * 2^32 + a_lo
    # Output: [c_lo, c_hi, ...] where c = a >> n
    # ==========================================================================

    # --- STEP 1: Compute 2^n and split into hi/lo ---
    pow2
    # [2^b, a_lo, a_hi, ...]

    u32split swap
    # [pow_hi, pow_lo, a_lo, a_hi, ...]
    # For b=0: pow_hi=0, pow_lo=1
    # For b=1: pow_hi=0, pow_lo=2
    # For b=32: pow_hi=1, pow_lo=0

    # --- STEP 2: Compute divisor = pow_hi + pow_lo ---
    dup.1
    # [pow_lo, pow_hi, pow_lo, a_lo, a_hi, ...]

    add
    # [pow_hi+pow_lo, pow_lo, a_lo, a_hi, ...]
    # For b=0: divisor=1, pow_lo=1
    # For b=1: divisor=2, pow_lo=2

    # --- STEP 3: First division - a_hi / divisor ---
    movup.3
    # [a_hi, pow_hi+pow_lo, pow_lo, a_lo, ...]
    swap
    # [pow_hi+pow_lo, a_hi, pow_lo, a_lo, ...]
    # u32divmod: [divisor, dividend, ...] -> [remainder, quotient, ...]
    u32divmod
    swap
    # [quot1, rem1, pow_lo, a_lo, ...]
    # quot1 = a_hi / divisor
    # rem1 = a_hi % divisor

    # --- STEP 4: Rearrange for second division ---
    movup.3
    # [a_lo, quot1, rem1, pow_lo, ...]

    movup.3
    # [pow_lo, a_lo, quot1, rem1, ...]

    # --- STEP 5: Handle division by zero case (when pow_lo=0) ---
    dup
    # [pow_lo, pow_lo, a_lo, quot1, rem1, ...]

    eq.0
    # [is_zero, pow_lo, a_lo, quot1, rem1, ...]
    # is_zero = 1 if pow_lo==0, else 0

    # We want: adjusted_divisor = pow_lo - is_zero
    # This gives pow_lo when pow_lo>0, or -1 (0xFFFFFFFF) when pow_lo=0
    # u32overflowing_sub computes second - top = pow_lo - is_zero
    u32overflowing_sub
    # Computes: pow_lo - is_zero
    # [borrow, adjusted_div, a_lo, quot1, rem1, ...]
    # For pow_lo=1, is_zero=0: adjusted_div=1, borrow=0
    # For pow_lo=2, is_zero=0: adjusted_div=2, borrow=0
    # For pow_lo=0, is_zero=1: adjusted_div=0xFFFFFFFF, borrow=1

    not
    # [!borrow, adjusted_div, a_lo, quot1, rem1, ...]
    # !borrow = 1 when no underflow (normal case), 0 when underflow (b>=32 case)

    movdn.4
    # [adjusted_div, a_lo, quot1, rem1, !borrow, ...]

    dup
    # [adjusted_div, adjusted_div, a_lo, quot1, rem1, !borrow, ...]

    movdn.4
    # [adjusted_div, a_lo, quot1, rem1, adjusted_div, !borrow, ...]

    # --- STEP 6: Second division - a_lo / adjusted_div ---
    # => [adjusted_div, a_lo, quot1, rem1, adjusted_div, !borrow, ...]
    # u32divmod: [divisor, dividend, ...] -> [remainder, quotient, ...]
    u32divmod
    swap
    # [quot2, rem2, quot1, rem1, adjusted_div, !borrow, ...]
    # quot2 = a_lo / adjusted_div
    # rem2 = a_lo % adjusted_div

    swap drop
    # [quot2, quot1, rem1, adjusted_div, !borrow, ...]

    # --- STEP 7: Compute multiplier = (!borrow * 2^32) / adjusted_div ---
    push.4294967296
    # [2^32, quot2, quot1, rem1, adjusted_div, !borrow, ...]

    dup.5
    # [!borrow, 2^32, quot2, quot1, rem1, adjusted_div, !borrow, ...]

    mul
    # [!borrow*2^32, quot2, quot1, rem1, adjusted_div, !borrow, ...]
    # This is 2^32 when !borrow=1 (normal case), 0 when !borrow=0

    movup.4
    # [adjusted_div, !borrow*2^32, quot2, quot1, rem1, !borrow, ...]

    div
    # [multiplier, quot2, quot1, rem1, !borrow, ...]
    # multiplier = (!borrow * 2^32) / adjusted_div

    # --- STEP 8: Compute c_lo = rem1 * multiplier + quot2 ---
    movup.3
    # [rem1, multiplier, quot2, quot1, !borrow, ...]

    mul
    # [rem1*multiplier, quot2, quot1, !borrow, ...]

    add
    # [c_lo, quot1, !borrow, ...]
    # c_lo = rem1 * multiplier + quot2
    # This add is safe: when !borrow=1, multiplier = 2^32 / pow_lo and rem1 < pow_lo, so
    # rem1 * multiplier <= 2^32 - multiplier and adding quot2 < multiplier stays below 2^32.

    # --- STEP 9: Conditional swap to get final result ---
    dup.2
    # [!borrow, c_lo, quot1, !borrow, ...]

    cswap
    # if !borrow=1 (normal case): swap c_lo and quot1 -> [quot1, c_lo, !borrow, ...]
    # if !borrow=0 (b>=32 case): no swap -> [c_lo, quot1, !borrow, ...]
    # result in BE format with flag: [c_hi, c_lo, !borrow, ...]

    # when n >= 32, c_hi must be zero. multiply c_hi by !borrow to enforce this:
    # !borrow=1 keeps c_hi unchanged for n<32; !borrow=0 clears c_hi for n>=32.
    movup.2
    # [!borrow, c_hi, c_lo, ...]
    mul
    # [c_hi * !borrow, c_lo, ...]
    swap
    # [c_lo, c_hi, ...]
end

#! Performs left rotation of one unsigned 64-bit integer.
#! The input value to be rotated is assumed to be represented using 32 bit limbs, but this is not checked.
#! The rotation amount n should be in the range [0, 64), otherwise it will result in an error.
#! Stack transition looks as follows:
#! [n, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = a <<< n (rotate left).
#! This takes 46 cycles.
pub proc rotl(n: u32, a: u64) -> u64
    # ==========================================================================
    # LEFT ROTATION: Computes a <<< n (rotate left by n bits)
    # Input:  [n, a_lo, a_hi, ...] where a = a_hi * 2^32 + a_lo
    # Output: [c_lo, c_hi, ...] where c = a <<< n
    # ==========================================================================

    # --- STEP 1: Compute swap_flag = (31 - n) borrow, i.e., n > 31 ---
    push.31
    # [31, b, a_lo, a_hi, ...]

    dup.1
    # [b, 31, b, a_lo, a_hi, ...]
    u32overflowing_sub
    # Computes: 31 - b
    # [borrow, diff, b, a_lo, a_hi, ...]
    # borrow = 1 if b > 31, else 0

    swap
    # [diff, borrow, b, a_lo, a_hi, ...]

    drop
    # [borrow, b, a_lo, a_hi, ...]
    # borrow indicates if b > 31 (swap_flag)

    movdn.3
    # [b, a_lo, a_hi, borrow, ...]

    # --- STEP 2: Compute shift_amt = b & 31 and 2^shift_amt ---
    push.31
    # [31, b, a_lo, a_hi, borrow, ...]

    u32and
    # [b&31, a_lo, a_hi, borrow, ...]
    # shift_amt = b mod 32

    pow2
    # [2^shift_amt, a_lo, a_hi, borrow, ...]

    dup
    # [2^shift_amt, 2^shift_amt, a_lo, a_hi, borrow, ...]

    movup.2
    # [a_lo, 2^shift_amt, 2^shift_amt, a_hi, borrow, ...]

    # --- STEP 3: Shift the low limb: a_lo * 2^shift_amt ---
    u32widening_mul
    # outputs [lo, hi]: lo = (a_lo * 2^shift_amt) mod 2^32, hi = overflow
    # [lo1, hi1, 2^shift_amt, a_hi, borrow, ...]

    swap
    # [hi1, lo1, 2^shift_amt, a_hi, borrow, ...]

    # --- STEP 4: Shift the high limb: a_hi * 2^shift_amt + hi1 ---
    movup.3
    # [a_hi, hi1, lo1, 2^shift_amt, borrow, ...]

    movup.3
    # [2^shift_amt, a_hi, hi1, lo1, borrow, ...]

    # u32widening_madd: computes top * second + third
    # Input: [2^shift_amt, a_hi, hi1, ...]
    # Computes: 2^shift_amt * a_hi + hi1
    # outputs [lo, hi]
    u32widening_madd
    # [lo2, hi2, lo1, borrow, ...]
    # lo2 = (a_hi * 2^shift_amt + hi1) mod 2^32
    # hi2 = overflow (bits that wrap around)

    # --- STEP 5: Carry the overflow to the low bits ---
    # c_lo_computed = lo1 + hi2 (the rotation overflow wraps to low bits)
    movdn.2
    # [hi2, lo1, lo2, borrow, ...]

    add
    # [c_lo_computed, lo2, borrow, ...]

    # --- STEP 6: Conditionally swap based on b > 31 ---
    movup.2
    # [borrow, c_lo_computed, lo2, ...]
    cswap
    # borrow=1 (b>31):  [lo2, c_lo_computed, ...]
    # borrow=0 (b<=31): [c_lo_computed, lo2, ...]
    # In both cases the top is c_lo and the second is c_hi.
end

#! Performs right rotation of one unsigned 64-bit integer.
#! The input value to be rotated is assumed to be represented using 32 bit limbs, but this is not checked.
#! The rotation amount n should be in the range [0, 64), otherwise it will result in an error.
#! Stack transition looks as follows:
#! [n, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = a >>> n (rotate right).
#! This takes 60 cycles.
pub proc rotr(n: u32, a: u64) -> u64
    # ==========================================================================
    # RIGHT ROTATION: Computes a >>> n (rotate right by n bits)
    # Input:  [n, a_lo, a_hi, ...] where a = a_hi * 2^32 + a_lo
    # Output: [c_lo, c_hi, ...] where c = a >>> n
    #
    # Strategy: Rotate right by n = left rotate by (64 - n)
    #           For n in [0,64), we compute left shift by ((32 - (n&31)) & 31)
    #           and conditionally swap limbs when the rotation crosses the 32-bit boundary
    # ==========================================================================

    # --- STEP 1: Compute is_upper_half = (n > 31) ---
    push.31
    # [31, b, a_lo, a_hi, ...]

    dup.1
    # [b, 31, b, a_lo, a_hi, ...]

    u32overflowing_sub
    # [borrow, diff, b, a_lo, a_hi, ...]
    # borrow = 1 if b > 31, else 0

    swap
    # [diff, borrow, b, a_lo, a_hi, ...]

    drop
    # [borrow, b, a_lo, a_hi, ...]
    # borrow indicates whether n is in the upper half [32, 63]

    movdn.3
    # [b, a_lo, a_hi, borrow, ...]

    # --- STEP 2: Compute complement shift amount = (32 - (b & 31)) & 31 ---
    push.31
    # [31, b, a_lo, a_hi, borrow, ...]

    u32and
    # [b&31, a_lo, a_hi, borrow, ...]
    # shift_amt = b mod 32

    dup
    # [shift_amt, shift_amt, a_lo, a_hi, borrow, ...]
    neq.0
    not
    # [shift_amt == 0, shift_amt, a_lo, a_hi, borrow, ...]

    movdn.4
    # [shift_amt, a_lo, a_hi, borrow, shift_amt == 0, ...]

    push.32
    # [32, shift_amt, a_lo, a_hi, borrow, shift_amt == 0, ...]

    swap
    # [shift_amt, 32, a_lo, a_hi, borrow, shift_amt == 0, ...]
    u32wrapping_sub
    # [32-shift_amt, a_lo, a_hi, borrow, shift_amt == 0, ...]

    push.31
    # [31, 32-shift_amt, a_lo, a_hi, borrow, shift_amt == 0, ...]
    u32and
    # [(32-shift_amt)&31, a_lo, a_hi, borrow, shift_amt == 0, ...]
    # complement_shift = (32 - (b mod 32)) mod 32

    pow2
    # [2^complement_shift, a_lo, a_hi, borrow, shift_amt == 0, ...]

    dup
    # [2^complement_shift, 2^complement_shift, a_lo, a_hi, borrow, shift_amt == 0, ...]

    movup.2
    # [a_lo, 2^complement_shift, 2^complement_shift, a_hi, borrow, shift_amt == 0, ...]

    # --- STEP 3: Left shift low limb by complement amount ---
    u32widening_mul
    # [lo1, hi1, 2^complement_shift, a_hi, borrow, shift_amt == 0, ...]
    # lo1 = (a_lo * 2^complement_shift) mod 2^32
    # hi1 = overflow bits (shift into high)

    swap
    # [hi1, lo1, 2^complement_shift, a_hi, borrow, shift_amt == 0, ...]

    # --- STEP 4: Left shift high limb by complement amount and add overflow ---
    movup.3
    # [a_hi, hi1, lo1, 2^complement_shift, borrow, shift_amt == 0, ...]

    movup.3
    # [2^complement_shift, a_hi, hi1, lo1, borrow, shift_amt == 0, ...]

    u32widening_madd
    # [lo2, hi2, lo1, borrow, shift_amt == 0, ...]
    # lo2 = (a_hi * 2^complement_shift + hi1) mod 2^32
    # hi2 = overflow bits (wrap around to low)

    # --- STEP 5: Carry the overflow to the low bits ---
    # c_lo_computed = lo1 + hi2 (overflow wraps to low bits)
    # The add cannot overflow: hi2 < 2^complement_shift and
    # lo1 = (a_lo * 2^complement_shift) mod 2^32 is a multiple of 2^complement_shift,
    # so lo1 + hi2 < 2^32.
    movdn.2
    # [hi2, lo1, lo2, borrow, shift_amt == 0, ...]

    add
    # [c_lo_computed, lo2, borrow, shift_amt == 0, ...]

    # --- STEP 6: Conditionally swap based on (n > 31) == (n mod 32 == 0) ---
    movup.2
    # [borrow, c_lo_computed, lo2, shift_amt == 0, ...]

    movup.3
    # [shift_amt == 0, borrow, c_lo_computed, lo2, ...]
    eq
    # [swap_flag, c_lo_computed, lo2, ...]
    # swap_flag = 1 for n in [1, 32] and 0 for n in {0} U [33, 63]

    cswap
    # swap_flag=1: [lo2, c_lo_computed, ...]
    # swap_flag=0: [c_lo_computed, lo2, ...]
    # In both cases the top is c_lo and the second is c_hi.
end

#! Counts the number of leading zeros of one unsigned 64-bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [n_lo, n_hi, ...] -> [clz, ...], where clz is the number of leading zeros of value n.
#! This takes 48 cycles.
pub proc clz(n: u64) -> u8
    # Input: [n_lo, n_hi, ...]
    swap        # [n_hi, n_lo, ...]
    dup.0
    eq.0

    if.true    # if n_hi == 0
        drop
        u32clz
        add.32 # clz(n_lo) + 32
    else
        swap
        drop
        u32clz # clz(n_hi)
    end
end

#! Counts the number of trailing zeros of one unsigned 64-bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [n_lo, n_hi, ...] -> [ctz, ...], where ctz is the number of trailing zeros of value n.
#! This takes 41 cycles.
pub proc ctz(n: u64) -> u8
    # Input: [n_lo, n_hi, ...]
    dup.0
    eq.0

    if.true    # if n_lo == 0
        drop
        u32ctz
        add.32 # ctz(n_hi) + 32
    else
        swap
        drop
        u32ctz # ctz(n_lo)
    end
end

#! Counts the number of leading ones of one unsigned 64-bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [n_lo, n_hi, ...] -> [clo, ...], where clo is the number of leading ones of value n.
#! This takes 47 cycles.
pub proc clo(n: u64) -> u8
    # Input: [n_lo, n_hi, ...]
    swap        # [n_hi, n_lo, ...]
    dup.0
    eq.4294967295

    if.true    # if n_hi == 11111111111111111111111111111111
        drop
        u32clo
        add.32 # clo(n_lo) + 32
    else
        swap
        drop
        u32clo # clo(n_hi)
    end
end

#! Counts the number of trailing ones of one unsigned 64-bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [n_lo, n_hi, ...] -> [cto, ...], where cto is the number of trailing ones of value n.
#! This takes 40 cycles.
pub proc cto(n: u64) -> u8
    # Input: [n_lo, n_hi, ...]
    dup.0
    eq.4294967295

    if.true    # if n_lo == 11111111111111111111111111111111
        drop
        u32cto
        add.32 # cto(n_hi) + 32
    else
        swap
        drop
        u32cto # cto(n_lo)
    end
end