# ===== HELPER FUNCTIONS ==========================================================================
#! Asserts that both values at the top of the stack are u64 values.
#! The input values are assumed to be represented using 32 bit limbs, fails if they are not.
#! This takes 6 cycles.
proc u32assert4
u32assert2
movup.3
movup.3
u32assert2
movup.3
movup.3
end
# ===== ADDITION ==================================================================================
#! Performs addition of two unsigned 64 bit integers discarding the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = (a + b) % 2^64
#! This takes 6 cycles.
pub proc wrapping_add(b: u64, a: u64) -> u64
exec.overflowing_add
drop
end
#! Performs addition of two unsigned 64 bit integers preserving the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [overflow, c_lo, c_hi, ...], where c = (a + b) % 2^64
#! This takes 5 cycles.
pub proc overflowing_add(b: u64, a: u64) -> (i1, u64)
movup.2 # => [a_lo, b_lo, b_hi, a_hi]
u32widening_add # => [sum_lo, carry, b_hi, a_hi]
movdn.3 # => [carry, b_hi, a_hi, sum_lo]
u32widening_add3 # => [sum_hi, overflow, sum_lo]
movdn.2 # => [overflow, sum_lo, sum_hi]
end
#! Performs addition of two unsigned 64 bit integers preserving the overflow with sum on top.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, overflow, ...], where c = (a + b) % 2^64
#! This takes 6 cycles.
pub proc widening_add(b: u64, a: u64) -> (u64, i1)
exec.overflowing_add
movdn.2
end
# ===== SUBTRACTION ===============================================================================
#! Performs subtraction of two unsigned 64 bit integers discarding the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = (a - b) % 2^64
#! This takes 12 cycles.
pub proc wrapping_sub(b: u64, a: u64) -> u64
movup.3 movup.3 # => [a_lo, a_hi, b_lo, b_hi]
movup.2 # => [b_lo, a_lo, a_hi, b_hi]
u32overflowing_sub # => [borrow_lo, c_lo, a_hi, b_hi] (a_lo - b_lo)
movup.2 # => [a_hi, borrow_lo, c_lo, b_hi]
movup.3 # => [b_hi, a_hi, borrow_lo, c_lo]
u32overflowing_sub # => [borrow_hi, diff_hi, borrow_lo, c_lo] (a_hi - b_hi)
drop # => [diff_hi, borrow_lo, c_lo]
swap # => [borrow_lo, diff_hi, c_lo]
u32overflowing_sub # => [borrow2, c_hi, c_lo] (diff_hi - borrow_lo)
drop # => [c_hi, c_lo]
swap # => [c_lo, c_hi]
end
#! Performs subtraction of two unsigned 64 bit integers preserving the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [underflow, c_lo, c_hi, ...], where c = (a - b) % 2^64
#! This takes 14 cycles.
pub proc overflowing_sub(b: u64, a: u64) -> (i1, u64)
movup.3 movup.3 # => [a_lo, a_hi, b_lo, b_hi]
movup.2 # => [b_lo, a_lo, a_hi, b_hi]
u32overflowing_sub # => [borrow_lo, c_lo, a_hi, b_hi] (a_lo - b_lo)
movup.2 # => [a_hi, borrow_lo, c_lo, b_hi]
movup.3 # => [b_hi, a_hi, borrow_lo, c_lo]
u32overflowing_sub # => [borrow_hi, diff_hi, borrow_lo, c_lo] (a_hi - b_hi)
swap # => [diff_hi, borrow_hi, borrow_lo, c_lo]
movup.2 # => [borrow_lo, diff_hi, borrow_hi, c_lo]
u32overflowing_sub # => [borrow2, c_hi, borrow_hi, c_lo] (diff_hi - borrow_lo)
movup.2 # => [borrow_hi, borrow2, c_hi, c_lo]
or # => [underflow, c_hi, c_lo]
movup.2 # => [c_lo, underflow, c_hi]
swap # => [underflow, c_lo, c_hi]
end
# ===== MULTIPLICATION ============================================================================
#! Performs multiplication of two unsigned 64 bit integers discarding the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = (a * b) % 2^64
#! This takes 15 cycles.
pub proc wrapping_mul(b: u64, a: u64) -> u64
dup.2 # => [a_lo, b_lo, b_hi, a_lo, a_hi]
dup.1 # => [b_lo, a_lo, b_lo, b_hi, a_lo, a_hi]
u32widening_mul # => [p0_lo, p0_hi, b_lo, b_hi, a_lo, a_hi]
swap # => [p0_hi, p0_lo, b_lo, b_hi, a_lo, a_hi]
movup.3 # => [b_hi, p0_hi, p0_lo, b_lo, a_lo, a_hi]
movup.4 # => [a_lo, b_hi, p0_hi, p0_lo, b_lo, a_hi]
u32widening_madd # => [t1_lo, t1_hi, p0_lo, b_lo, a_hi]
swap # => [t1_hi, t1_lo, p0_lo, b_lo, a_hi]
drop # => [t1_lo, p0_lo, b_lo, a_hi]
movup.2 # => [b_lo, t1_lo, p0_lo, a_hi]
movup.3 # => [a_hi, b_lo, t1_lo, p0_lo]
u32widening_madd # => [t2_lo, t2_hi, p0_lo]
swap # => [t2_hi, t2_lo, p0_lo]
drop # => [t2_lo, p0_lo]
swap # => [p0_lo, t2_lo] = [c_lo, c_hi]
end
#! Performs multiplication of two unsigned 64 bit integers preserving the overflow flag.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [overflow, c_lo, c_hi, ...], where c = (a * b) % 2^64
#! and overflow = 1 iff a * b >= 2^64.
#! This takes 38 cycles.
pub proc overflowing_mul(b: u64, a: u64) -> (i1, u64)
exec.widening_mul # [c_lo, c_mid_lo, c_mid_hi, c_hi, ...]
movup.3 movup.3 # [c_mid_hi, c_hi, c_lo, c_mid_lo, ...]
u32or # [c_mid_hi | c_hi, c_lo, c_mid_lo, ...]
neq.0 # [overflow, c_lo, c_mid_lo, ...]
end
#! Performs multiplication of two unsigned 64 bit integers preserving the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_mid_lo, c_mid_hi, c_hi, ...], where
#! c = a * b is represented as a 128-bit value split into 4 32-bit limbs.
#! This takes 23 cycles.
pub proc widening_mul(b: u64, a: u64) -> u128
# c = a*b = a_lo*b_lo + (a_lo*b_hi + a_hi*b_lo)*2^32 + a_hi*b_hi*2^64
# Partial products accumulate high-first on the stack, then reversew flips to LE.
# => [b_lo, b_hi, a_lo, a_hi, ...]
dup.0
dup.3
# => [a_lo, b_lo, b_lo, b_hi, a_lo, a_hi, ...]
u32widening_mul
# => [p0_lo, p0_hi, b_lo, b_hi, a_lo, a_hi, ...] where p0 = a_lo * b_lo
swap
# => [p0_hi, p0_lo, b_lo, b_hi, a_lo, a_hi, ...]
dup.3
movup.5
# => [a_lo, b_hi, p0_hi, p0_lo, b_lo, b_hi, a_hi, ...]
u32widening_madd
# => [t1_lo, t1_hi, p0_lo, b_lo, b_hi, a_hi, ...] where t1 = a_lo * b_hi + p0_hi
movup.3
dup.5
# => [a_hi, b_lo, t1_lo, t1_hi, p0_lo, b_hi, a_hi, ...]
u32widening_madd
# => [t2_lo, t2_hi, t1_hi, p0_lo, b_hi, a_hi, ...] where t2 = a_hi * b_lo + t1_lo
swap
# => [t2_hi, t2_lo, t1_hi, p0_lo, b_hi, a_hi, ...]
movup.4
movup.5
# => [a_hi, b_hi, t2_hi, t2_lo, t1_hi, p0_lo, ...]
u32widening_madd
# => [t3_lo, t3_hi, t2_lo, t1_hi, p0_lo, ...] where t3 = a_hi * b_hi + t2_hi
swap
# => [t3_hi, t3_lo, t2_lo, t1_hi, p0_lo, ...]
movup.3
movup.2
# => [t3_lo, t1_hi, t3_hi, t2_lo, p0_lo, ...]
u32widening_add
# => [sum, carry, t3_hi, t2_lo, p0_lo, ...] where sum = (t3_lo + t1_hi) mod 2^32
movdn.2
# => [carry, t3_hi, sum, t2_lo, p0_lo, ...]
add
# => [c_hi, c_mid_hi, c_mid_lo, c_lo, ...]
reversew
# => [c_lo, c_mid_lo, c_mid_hi, c_hi, ...]
end
# ===== COMPARISONS ===============================================================================
#! Performs less-than comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c, ...], where c = 1 when a < b, and 0 otherwise.
#! This takes 13 cycles.
pub proc lt(b: u64, a: u64) -> i1
movup.3 movup.3
# => [a_lo, a_hi, b_lo, b_hi, ...]
# Computes a < b, which is true iff:
# - a_hi < b_hi, OR
# - a_hi == b_hi AND a_lo < b_lo
movup.2
# => [b_lo, a_lo, a_hi, b_hi, ...]
u32overflowing_sub
# => [borrow_lo, diff_lo, a_hi, b_hi, ...] where diff_lo = a_lo - b_lo
movdn.3
# => [diff_lo, a_hi, b_hi, borrow_lo, ...]
drop
# => [a_hi, b_hi, borrow_lo, ...]
swap
# => [b_hi, a_hi, borrow_lo, ...]
u32overflowing_sub
# => [borrow_hi, diff_hi, borrow_lo, ...] where diff_hi = a_hi - b_hi
swap
# => [diff_hi, borrow_hi, borrow_lo, ...]
eq.0
# => [diff_hi==0, borrow_hi, borrow_lo, ...]
movup.2
# => [borrow_lo, diff_hi==0, borrow_hi, ...]
and
# => [(diff_hi==0 AND borrow_lo), borrow_hi, ...]
or
# => [result, ...] where result = borrow_hi OR (diff_hi==0 AND borrow_lo)
end
#! Performs greater-than comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c, ...], where c = 1 when a > b, and 0 otherwise.
#! This takes 13 cycles.
pub proc gt(b: u64, a: u64) -> i1
movup.3 movup.3
# => [a_lo, a_hi, b_lo, b_hi, ...]
# Computes a > b, which is equivalent to b < a, true iff:
# - b_hi < a_hi, OR
# - b_hi == a_hi AND b_lo < a_lo
movup.2
# => [b_lo, a_lo, a_hi, b_hi, ...]
swap
# => [a_lo, b_lo, a_hi, b_hi, ...]
u32overflowing_sub
# => [borrow_lo, diff_lo, a_hi, b_hi, ...] where diff_lo = b_lo - a_lo
movdn.3
# => [diff_lo, a_hi, b_hi, borrow_lo, ...]
drop
# => [a_hi, b_hi, borrow_lo, ...]
u32overflowing_sub
# => [borrow_hi, diff_hi, borrow_lo, ...] where diff_hi = b_hi - a_hi
swap
# => [diff_hi, borrow_hi, borrow_lo, ...]
eq.0
# => [diff_hi==0, borrow_hi, borrow_lo, ...]
movup.2
# => [borrow_lo, diff_hi==0, borrow_hi, ...]
and
# => [(diff_hi==0 AND borrow_lo), borrow_hi, ...]
or
# => [result, ...] where result = borrow_hi OR (diff_hi==0 AND borrow_lo)
end
#! Performs less-than-or-equal comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c, ...], where c = 1 when a <= b, and 0 otherwise.
#! This takes 14 cycles.
pub proc lte(b: u64, a: u64) -> i1
exec.gt
not
end
#! Performs greater-than-or-equal comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c, ...], where c = 1 when a >= b, and 0 otherwise.
#! This takes 13 cycles.
pub proc gte(b: u64, a: u64) -> i1
exec.lt
not
end
#! Performs equality comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c, ...], where c = 1 when a == b, and 0 otherwise.
#! This takes 6 cycles.
pub proc eq(b: u64, a: u64) -> i1
movup.2 # => [a_lo, b_lo, b_hi, a_hi]
eq # => [a_lo==b_lo, b_hi, a_hi]
swap.2 # => [a_hi, b_hi, a_lo==b_lo]
eq # => [a_hi==b_hi, a_lo==b_lo]
and # => [result]
end
#! Performs inequality comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c, ...], where c = 1 when a != b, and 0 otherwise.
#! This takes 8 cycles.
pub proc neq(b: u64, a: u64) -> i1
movup.2 # => [a_lo, b_lo, b_hi, a_hi]
neq # => [a_lo!=b_lo, b_hi, a_hi]
swap.2 # => [a_hi, b_hi, a_lo!=b_lo]
neq # => [a_hi!=b_hi, a_lo!=b_lo]
or # => [result]
end
#! Performs comparison to zero of an unsigned 64 bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [a_lo, a_hi, ...] -> [c, ...], where c = 1 when a == 0, and 0 otherwise.
#! This takes 4 cycles.
pub proc eqz(a: u64) -> i1
eq.0 # => [a_lo==0, a_hi]
swap # => [a_hi, a_lo==0]
eq.0 # => [a_hi==0, a_lo==0]
and # => [result]
end
#! Compares two unsigned 64 bit integers and drops the larger one from the stack.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = min(a, b).
#! This takes 27 cycles.
pub proc min(b: u64, a: u64) -> u64
movup.3 movup.3
# => [a_lo, a_hi, b_lo, b_hi, ...]
dupw
exec.gt
movup.4
movup.3
dup.2
cdrop
movdn.3
cdrop
end
#! Compares two unsigned 64 bit integers and drops the smaller one from the stack.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = max(a, b).
#! This takes 27 cycles.
pub proc max(b: u64, a: u64) -> u64
movup.3 movup.3
# => [a_lo, a_hi, b_lo, b_hi, ...]
dupw
exec.lt
movup.4
movup.3
dup.2
cdrop
movdn.3
cdrop
end
# ===== DIVISION ==================================================================================
const U64_DIV_EVENT = event("miden::core::math::u64::u64_div")
#! Performs division of two unsigned 64 bit integers discarding the remainder.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = a // b
#! This takes 56 cycles.
pub proc div(b: u64, a: u64) -> u64
exec.divmod
drop drop
end
# ===== MODULO OPERATION ==========================================================================
#! Performs modulo operation of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = a % b
#! This takes 58 cycles.
pub proc mod(b: u64, a: u64) -> u64
exec.divmod
movup.2 drop
movup.2 drop
end
# ===== DIVMOD OPERATION ==========================================================================
#! Performs divmod operation of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [r_lo, r_hi, q_lo, q_hi, ...], where q = a / b, r = a % b.
#! This takes 54 cycles.
pub proc divmod(b: u64, a: u64) -> (remainder: u64, quotient: u64)
# Input stack: [b_lo, b_hi, a_lo, a_hi, ...]
# We verify: a = q * b + r, where q is quotient, r is remainder
# Output: [r_lo, r_hi, q_lo, q_hi, ...]
emit.U64_DIV_EVENT
# Advice stack now has [q_hi, q_lo, r_hi, r_lo]
# => [b_lo, b_hi, a_lo, a_hi, ...]
adv_push adv_push
# Pops from advice: first q_hi, then q_lo, pushes them
# => [q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
u32assert2
#==========================================================================
# MULTIPLICATION BLOCK: Compute prod = q * b (must fit in 64 bits)
#==========================================================================
# Same as div procedure
dup.2
# => [b_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
dup.1
# => [q_lo, b_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
u32widening_mul
# outputs [lo, hi]: Stack: [p1_lo, p1_hi, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
swap
# => [p1_hi, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
dup.5
# => [b_hi, p1_hi, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
dup.3
# => [q_lo, b_hi, p1_hi, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
u32widening_madd
# outputs [lo, hi]: Stack: [p2_lo, p2_hi, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
swap
# => [p2_hi, p2_lo, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
eq.0
assert.err="comparison failed: divmod"
# => [p2_lo, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
dup.4
# => [b_lo, p2_lo, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
dup.4
# => [q_hi, b_lo, p2_lo, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
u32widening_madd
# outputs [lo, hi]: Stack: [p3_lo, p3_hi, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
swap
# => [p3_hi, p3_lo, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
eq.0
assert.err="comparison failed: divmod"
# => [p3_lo, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
# prod_hi = p3_lo, prod_lo = p1_lo
dup.5
# => [b_hi, p3_lo, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
dup.4
# => [q_hi, b_hi, p3_lo, p1_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
mul
eq.0
assert.err="comparison failed: divmod"
# => [prod_hi, prod_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
#==========================================================================
# REMAINDER BLOCK: Get remainder and verify b > r
#==========================================================================
adv_push adv_push
# Pops from advice: first r_hi, then r_lo
# => [r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, b_lo, b_hi, a_lo, a_hi, ...]
u32assert2
movup.6
# => [b_lo, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, b_hi, a_lo, a_hi, ...]
movup.7
# => [b_hi, b_lo, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
swap
# => [b_lo, b_hi, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
dup.3
# => [r_hi, b_lo, b_hi, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
dup.3
# => [r_lo, r_hi, b_lo, b_hi, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
movup.3
# => [b_hi, r_lo, r_hi, b_lo, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
movup.3
# => [b_lo, b_hi, r_lo, r_hi, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
exec.lt
# Computes r < b, consumes both pairs
# => [1, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
assert.err="comparison failed: divmod"
# => [r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
#==========================================================================
# VERIFICATION BLOCK: Check prod + r = a
#==========================================================================
# We need: sum_lo = (prod_lo + r_lo) mod 2^32, sum_hi = prod_hi + r_hi + carry_lo
# And verify: sum_lo == a_lo, sum_hi == a_hi
# We also need to keep r_lo, r_hi on the stack for output
# Current stack: [r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
# Duplicate r_lo for the add (we'll need it in output)
dup.0
# => [r_lo, r_lo, r_hi, prod_hi, prod_lo, q_lo, q_hi, a_lo, a_hi, ...]
movup.4
# => [prod_lo, r_lo, r_lo, r_hi, prod_hi, q_lo, q_hi, a_lo, a_hi, ...]
u32widening_add
# outputs [sum, carry]: computes prod_lo + r_lo
# => [sum_lo, carry_lo, r_lo, r_hi, prod_hi, q_lo, q_hi, a_lo, a_hi, ...]
# Now we need to compute: sum_hi = prod_hi + r_hi + carry_lo
# Duplicate r_hi for the add (we'll need it in output)
swap
# => [carry_lo, sum_lo, r_lo, r_hi, prod_hi, q_lo, q_hi, a_lo, a_hi, ...]
dup.3
# => [r_hi, carry_lo, sum_lo, r_lo, r_hi, prod_hi, q_lo, q_hi, a_lo, a_hi, ...]
movup.5
# => [prod_hi, r_hi, carry_lo, sum_lo, r_lo, r_hi, q_lo, q_hi, a_lo, a_hi, ...]
movup.2
# => [carry_lo, prod_hi, r_hi, sum_lo, r_lo, r_hi, q_lo, q_hi, a_lo, a_hi, ...]
u32widening_add3
# outputs [sum, carry]: computes carry_lo + prod_hi + r_hi
# => [sum_hi, carry_hi, sum_lo, r_lo, r_hi, q_lo, q_hi, a_lo, a_hi, ...]
swap
# => [carry_hi, sum_hi, sum_lo, r_lo, r_hi, q_lo, q_hi, a_lo, a_hi, ...]
eq.0
assert.err="comparison failed: divmod"
# => [sum_hi, sum_lo, r_lo, r_hi, q_lo, q_hi, a_lo, a_hi, ...]
# Verify sum_hi == a_hi
movup.7
# => [a_hi, sum_hi, sum_lo, r_lo, r_hi, q_lo, q_hi, a_lo, ...]
assert_eq.err="comparison failed: divmod"
# => [sum_lo, r_lo, r_hi, q_lo, q_hi, a_lo, ...]
# Verify sum_lo == a_lo
movup.5
# => [a_lo, sum_lo, r_lo, r_hi, q_lo, q_hi, ...]
assert_eq.err="comparison failed: divmod"
# => [r_lo, r_hi, q_lo, q_hi, ...]
end
# ===== BITWISE OPERATIONS ========================================================================
#! Performs bitwise AND of two unsigned 64-bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = a AND b.
#! This takes 5 cycles.
pub proc and(b: u64, a: u64) -> u64
movup.2 # => [a_lo, b_lo, b_hi, a_hi]
u32and # => [c_lo, b_hi, a_hi]
swap.2 # => [a_hi, b_hi, c_lo]
u32and # => [c_hi, c_lo]
swap # => [c_lo, c_hi]
end
#! Performs bitwise OR of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = a OR b.
#! This takes 5 cycles.
pub proc or(b: u64, a: u64) -> u64
movup.2 # => [a_lo, b_lo, b_hi, a_hi]
u32or # => [c_lo, b_hi, a_hi]
swap.2 # => [a_hi, b_hi, c_lo]
u32or # => [c_hi, c_lo]
swap # => [c_lo, c_hi]
end
#! Performs bitwise XOR of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_lo, b_hi, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = a XOR b.
#! This takes 5 cycles.
pub proc xor(b: u64, a: u64) -> u64
movup.2 # => [a_lo, b_lo, b_hi, a_hi]
u32xor # => [c_lo, b_hi, a_hi]
swap.2 # => [a_hi, b_hi, c_lo]
u32xor # => [c_hi, c_lo]
swap # => [c_lo, c_hi]
end
#! Performs bitwise NOT of one unsigned 64 bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = !a (i.e., 2^64 - 1 - a).
#! This takes 4 cycles.
pub proc not(a: u64) -> u64
u32not # [!a_lo, a_hi, ...]
swap # [a_hi, !a_lo, ...]
u32not # [!a_hi, !a_lo, ...]
swap # [!a_lo, !a_hi, ...]
end
#! Performs left shift of one unsigned 64-bit integer.
#! The input value to be shifted is assumed to be represented using 32 bit limbs, but this is not checked.
#! The shift value n should be in the range [0, 64), otherwise it will result in an error.
#! Stack transition looks as follows:
#! [n, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = (a << n) mod 2^64.
#! This takes 21 cycles.
pub proc shl(n: u32, a: u64) -> u64
pow2 # [2^n, a_lo, a_hi]
u32split # [pow_lo, pow_hi, a_lo, a_hi]
movup.2 # [a_lo, pow_lo, pow_hi, a_hi]
movup.3 # [a_hi, a_lo, pow_lo, pow_hi]
swap # [a_lo, a_hi, pow_lo, pow_hi]
exec.wrapping_mul
end
#! Performs right shift of one unsigned 64-bit integer.
#! The input value to be shifted is assumed to be represented using 32 bit limbs, but this is not checked.
#! The shift value n should be in the range [0, 64), otherwise it will result in an error.
#! Stack transition looks as follows:
#! [n, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = a >> n.
#! This takes 60 or 61 cycles, depending on n.
pub proc shr(n: u32, a: u64) -> u64
# ==========================================================================
# RIGHT SHIFT: Computes a >> n where a is 64-bit and n is shift amount
# Input: [n, a_lo, a_hi, ...] where a = a_hi * 2^32 + a_lo
# Output: [c_lo, c_hi, ...] where c = a >> n
# ==========================================================================
# --- STEP 1: Compute 2^n and split into hi/lo ---
pow2
# [2^b, a_lo, a_hi, ...]
u32split swap
# [pow_hi, pow_lo, a_lo, a_hi, ...]
# For b=0: pow_hi=0, pow_lo=1
# For b=1: pow_hi=0, pow_lo=2
# For b=32: pow_hi=1, pow_lo=0
# --- STEP 2: Compute divisor = pow_hi + pow_lo ---
dup.1
# [pow_lo, pow_hi, pow_lo, a_lo, a_hi, ...]
add
# [pow_hi+pow_lo, pow_lo, a_lo, a_hi, ...]
# For b=0: divisor=1, pow_lo=1
# For b=1: divisor=2, pow_lo=2
# --- STEP 3: First division - a_hi / divisor ---
movup.3
# [a_hi, pow_hi+pow_lo, pow_lo, a_lo, ...]
swap
# [pow_hi+pow_lo, a_hi, pow_lo, a_lo, ...]
# u32divmod: [divisor, dividend, ...] -> [remainder, quotient, ...]
u32divmod
swap
# [quot1, rem1, pow_lo, a_lo, ...]
# quot1 = a_hi / divisor
# rem1 = a_hi % divisor
# --- STEP 4: Rearrange for second division ---
movup.3
# [a_lo, quot1, rem1, pow_lo, ...]
movup.3
# [pow_lo, a_lo, quot1, rem1, ...]
# --- STEP 5: Handle division by zero case (when pow_lo=0) ---
dup
# [pow_lo, pow_lo, a_lo, quot1, rem1, ...]
eq.0
# [is_zero, pow_lo, a_lo, quot1, rem1, ...]
# is_zero = 1 if pow_lo==0, else 0
# We want: adjusted_divisor = pow_lo - is_zero
# This gives pow_lo when pow_lo>0, or -1 (0xFFFFFFFF) when pow_lo=0
# u32overflowing_sub computes second - top = pow_lo - is_zero
u32overflowing_sub
# Computes: pow_lo - is_zero
# [borrow, adjusted_div, a_lo, quot1, rem1, ...]
# For pow_lo=1, is_zero=0: adjusted_div=1, borrow=0
# For pow_lo=2, is_zero=0: adjusted_div=2, borrow=0
# For pow_lo=0, is_zero=1: adjusted_div=0xFFFFFFFF, borrow=1
not
# [!borrow, adjusted_div, a_lo, quot1, rem1, ...]
# !borrow = 1 when no underflow (normal case), 0 when underflow (b>=32 case)
movdn.4
# [adjusted_div, a_lo, quot1, rem1, !borrow, ...]
dup
# [adjusted_div, adjusted_div, a_lo, quot1, rem1, !borrow, ...]
movdn.4
# [adjusted_div, a_lo, quot1, rem1, adjusted_div, !borrow, ...]
# --- STEP 6: Second division - a_lo / adjusted_div ---
# => [adjusted_div, a_lo, quot1, rem1, adjusted_div, !borrow, ...]
# u32divmod: [divisor, dividend, ...] -> [remainder, quotient, ...]
u32divmod
swap
# [quot2, rem2, quot1, rem1, adjusted_div, !borrow, ...]
# quot2 = a_lo / adjusted_div
# rem2 = a_lo % adjusted_div
swap drop
# [quot2, quot1, rem1, adjusted_div, !borrow, ...]
# --- STEP 7: Compute multiplier = (!borrow * 2^32) / adjusted_div ---
push.4294967296
# [2^32, quot2, quot1, rem1, adjusted_div, !borrow, ...]
dup.5
# [!borrow, 2^32, quot2, quot1, rem1, adjusted_div, !borrow, ...]
mul
# [!borrow*2^32, quot2, quot1, rem1, adjusted_div, !borrow, ...]
# This is 2^32 when !borrow=1 (normal case), 0 when !borrow=0
movup.4
# [adjusted_div, !borrow*2^32, quot2, quot1, rem1, !borrow, ...]
div
# [multiplier, quot2, quot1, rem1, !borrow, ...]
# multiplier = (!borrow * 2^32) / adjusted_div
# --- STEP 8: Compute c_lo = rem1 * multiplier + quot2 ---
movup.3
# [rem1, multiplier, quot2, quot1, !borrow, ...]
mul
# [rem1*multiplier, quot2, quot1, !borrow, ...]
add
# [c_lo, quot1, !borrow, ...]
# c_lo = rem1 * multiplier + quot2
# This add is safe: when !borrow=1, multiplier = 2^32 / pow_lo and rem1 < pow_lo, so
# rem1 * multiplier <= 2^32 - multiplier and adding quot2 < multiplier stays below 2^32.
# --- STEP 9: Conditional swap to get final result ---
dup.2
# [!borrow, c_lo, quot1, !borrow, ...]
cswap
# if !borrow=1 (normal case): swap c_lo and quot1 -> [quot1, c_lo, !borrow, ...]
# if !borrow=0 (b>=32 case): no swap -> [c_lo, quot1, !borrow, ...]
# result in BE format with flag: [c_hi, c_lo, !borrow, ...]
# when n >= 32, c_hi must be zero. multiply c_hi by !borrow to enforce this:
# !borrow=1 keeps c_hi unchanged for n<32; !borrow=0 clears c_hi for n>=32.
movup.2
# [!borrow, c_hi, c_lo, ...]
mul
# [c_hi * !borrow, c_lo, ...]
swap
# [c_lo, c_hi, ...]
end
#! Performs left rotation of one unsigned 64-bit integer.
#! The input value to be rotated is assumed to be represented using 32 bit limbs, but this is not checked.
#! The rotation amount n should be in the range [0, 64), otherwise it will result in an error.
#! Stack transition looks as follows:
#! [n, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = a <<< n (rotate left).
#! This takes 46 cycles.
pub proc rotl(n: u32, a: u64) -> u64
# ==========================================================================
# LEFT ROTATION: Computes a <<< n (rotate left by n bits)
# Input: [n, a_lo, a_hi, ...] where a = a_hi * 2^32 + a_lo
# Output: [c_lo, c_hi, ...] where c = a <<< n
# ==========================================================================
# --- STEP 1: Compute swap_flag = (31 - n) borrow, i.e., n > 31 ---
push.31
# [31, b, a_lo, a_hi, ...]
dup.1
# [b, 31, b, a_lo, a_hi, ...]
u32overflowing_sub
# Computes: 31 - b
# [borrow, diff, b, a_lo, a_hi, ...]
# borrow = 1 if b > 31, else 0
swap
# [diff, borrow, b, a_lo, a_hi, ...]
drop
# [borrow, b, a_lo, a_hi, ...]
# borrow indicates if b > 31 (swap_flag)
movdn.3
# [b, a_lo, a_hi, borrow, ...]
# --- STEP 2: Compute shift_amt = b & 31 and 2^shift_amt ---
push.31
# [31, b, a_lo, a_hi, borrow, ...]
u32and
# [b&31, a_lo, a_hi, borrow, ...]
# shift_amt = b mod 32
pow2
# [2^shift_amt, a_lo, a_hi, borrow, ...]
dup
# [2^shift_amt, 2^shift_amt, a_lo, a_hi, borrow, ...]
movup.2
# [a_lo, 2^shift_amt, 2^shift_amt, a_hi, borrow, ...]
# --- STEP 3: Shift the low limb: a_lo * 2^shift_amt ---
u32widening_mul
# outputs [lo, hi]: lo = (a_lo * 2^shift_amt) mod 2^32, hi = overflow
# [lo1, hi1, 2^shift_amt, a_hi, borrow, ...]
swap
# [hi1, lo1, 2^shift_amt, a_hi, borrow, ...]
# --- STEP 4: Shift the high limb: a_hi * 2^shift_amt + hi1 ---
movup.3
# [a_hi, hi1, lo1, 2^shift_amt, borrow, ...]
movup.3
# [2^shift_amt, a_hi, hi1, lo1, borrow, ...]
# u32widening_madd: computes top * second + third
# Input: [2^shift_amt, a_hi, hi1, ...]
# Computes: 2^shift_amt * a_hi + hi1
# outputs [lo, hi]
u32widening_madd
# [lo2, hi2, lo1, borrow, ...]
# lo2 = (a_hi * 2^shift_amt + hi1) mod 2^32
# hi2 = overflow (bits that wrap around)
# --- STEP 5: Carry the overflow to the low bits ---
# c_lo_computed = lo1 + hi2 (the rotation overflow wraps to low bits)
movdn.2
# [hi2, lo1, lo2, borrow, ...]
add
# [c_lo_computed, lo2, borrow, ...]
# --- STEP 6: Conditionally swap based on b > 31 ---
movup.2
# [borrow, c_lo_computed, lo2, ...]
cswap
# borrow=1 (b>31): [lo2, c_lo_computed, ...]
# borrow=0 (b<=31): [c_lo_computed, lo2, ...]
# In both cases the top is c_lo and the second is c_hi.
end
#! Performs right rotation of one unsigned 64-bit integer.
#! The input value to be rotated is assumed to be represented using 32 bit limbs, but this is not checked.
#! The rotation amount n should be in the range [0, 64), otherwise it will result in an error.
#! Stack transition looks as follows:
#! [n, a_lo, a_hi, ...] -> [c_lo, c_hi, ...], where c = a >>> n (rotate right).
#! This takes 60 cycles.
pub proc rotr(n: u32, a: u64) -> u64
# ==========================================================================
# RIGHT ROTATION: Computes a >>> n (rotate right by n bits)
# Input: [n, a_lo, a_hi, ...] where a = a_hi * 2^32 + a_lo
# Output: [c_lo, c_hi, ...] where c = a >>> n
#
# Strategy: Rotate right by n = left rotate by (64 - n)
# For n in [0,64), we compute left shift by ((32 - (n&31)) & 31)
# and conditionally swap limbs when the rotation crosses the 32-bit boundary
# ==========================================================================
# --- STEP 1: Compute is_upper_half = (n > 31) ---
push.31
# [31, b, a_lo, a_hi, ...]
dup.1
# [b, 31, b, a_lo, a_hi, ...]
u32overflowing_sub
# [borrow, diff, b, a_lo, a_hi, ...]
# borrow = 1 if b > 31, else 0
swap
# [diff, borrow, b, a_lo, a_hi, ...]
drop
# [borrow, b, a_lo, a_hi, ...]
# borrow indicates whether n is in the upper half [32, 63]
movdn.3
# [b, a_lo, a_hi, borrow, ...]
# --- STEP 2: Compute complement shift amount = (32 - (b & 31)) & 31 ---
push.31
# [31, b, a_lo, a_hi, borrow, ...]
u32and
# [b&31, a_lo, a_hi, borrow, ...]
# shift_amt = b mod 32
dup
# [shift_amt, shift_amt, a_lo, a_hi, borrow, ...]
neq.0
not
# [shift_amt == 0, shift_amt, a_lo, a_hi, borrow, ...]
movdn.4
# [shift_amt, a_lo, a_hi, borrow, shift_amt == 0, ...]
push.32
# [32, shift_amt, a_lo, a_hi, borrow, shift_amt == 0, ...]
swap
# [shift_amt, 32, a_lo, a_hi, borrow, shift_amt == 0, ...]
u32wrapping_sub
# [32-shift_amt, a_lo, a_hi, borrow, shift_amt == 0, ...]
push.31
# [31, 32-shift_amt, a_lo, a_hi, borrow, shift_amt == 0, ...]
u32and
# [(32-shift_amt)&31, a_lo, a_hi, borrow, shift_amt == 0, ...]
# complement_shift = (32 - (b mod 32)) mod 32
pow2
# [2^complement_shift, a_lo, a_hi, borrow, shift_amt == 0, ...]
dup
# [2^complement_shift, 2^complement_shift, a_lo, a_hi, borrow, shift_amt == 0, ...]
movup.2
# [a_lo, 2^complement_shift, 2^complement_shift, a_hi, borrow, shift_amt == 0, ...]
# --- STEP 3: Left shift low limb by complement amount ---
u32widening_mul
# [lo1, hi1, 2^complement_shift, a_hi, borrow, shift_amt == 0, ...]
# lo1 = (a_lo * 2^complement_shift) mod 2^32
# hi1 = overflow bits (shift into high)
swap
# [hi1, lo1, 2^complement_shift, a_hi, borrow, shift_amt == 0, ...]
# --- STEP 4: Left shift high limb by complement amount and add overflow ---
movup.3
# [a_hi, hi1, lo1, 2^complement_shift, borrow, shift_amt == 0, ...]
movup.3
# [2^complement_shift, a_hi, hi1, lo1, borrow, shift_amt == 0, ...]
u32widening_madd
# [lo2, hi2, lo1, borrow, shift_amt == 0, ...]
# lo2 = (a_hi * 2^complement_shift + hi1) mod 2^32
# hi2 = overflow bits (wrap around to low)
# --- STEP 5: Carry the overflow to the low bits ---
# c_lo_computed = lo1 + hi2 (overflow wraps to low bits)
# The add cannot overflow: hi2 < 2^complement_shift and
# lo1 = (a_lo * 2^complement_shift) mod 2^32 is a multiple of 2^complement_shift,
# so lo1 + hi2 < 2^32.
movdn.2
# [hi2, lo1, lo2, borrow, shift_amt == 0, ...]
add
# [c_lo_computed, lo2, borrow, shift_amt == 0, ...]
# --- STEP 6: Conditionally swap based on (n > 31) == (n mod 32 == 0) ---
movup.2
# [borrow, c_lo_computed, lo2, shift_amt == 0, ...]
movup.3
# [shift_amt == 0, borrow, c_lo_computed, lo2, ...]
eq
# [swap_flag, c_lo_computed, lo2, ...]
# swap_flag = 1 for n in [1, 32] and 0 for n in {0} U [33, 63]
cswap
# swap_flag=1: [lo2, c_lo_computed, ...]
# swap_flag=0: [c_lo_computed, lo2, ...]
# In both cases the top is c_lo and the second is c_hi.
end
#! Counts the number of leading zeros of one unsigned 64-bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [n_lo, n_hi, ...] -> [clz, ...], where clz is the number of leading zeros of value n.
#! This takes 48 cycles.
pub proc clz(n: u64) -> u8
# Input: [n_lo, n_hi, ...]
swap # [n_hi, n_lo, ...]
dup.0
eq.0
if.true # if n_hi == 0
drop
u32clz
add.32 # clz(n_lo) + 32
else
swap
drop
u32clz # clz(n_hi)
end
end
#! Counts the number of trailing zeros of one unsigned 64-bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [n_lo, n_hi, ...] -> [ctz, ...], where ctz is the number of trailing zeros of value n.
#! This takes 41 cycles.
pub proc ctz(n: u64) -> u8
# Input: [n_lo, n_hi, ...]
dup.0
eq.0
if.true # if n_lo == 0
drop
u32ctz
add.32 # ctz(n_hi) + 32
else
swap
drop
u32ctz # ctz(n_lo)
end
end
#! Counts the number of leading ones of one unsigned 64-bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [n_lo, n_hi, ...] -> [clo, ...], where clo is the number of leading ones of value n.
#! This takes 47 cycles.
pub proc clo(n: u64) -> u8
# Input: [n_lo, n_hi, ...]
swap # [n_hi, n_lo, ...]
dup.0
eq.4294967295
if.true # if n_hi == 11111111111111111111111111111111
drop
u32clo
add.32 # clo(n_lo) + 32
else
swap
drop
u32clo # clo(n_hi)
end
end
#! Counts the number of trailing ones of one unsigned 64-bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [n_lo, n_hi, ...] -> [cto, ...], where cto is the number of trailing ones of value n.
#! This takes 40 cycles.
pub proc cto(n: u64) -> u8
# Input: [n_lo, n_hi, ...]
dup.0
eq.4294967295
if.true # if n_lo == 11111111111111111111111111111111
drop
u32cto
add.32 # cto(n_hi) + 32
else
swap
drop
u32cto # cto(n_lo)
end
end