arcium-primitives 0.4.0

use itertools::izip;

use crate::{algebra::field::mersenne::Mersenne107, izip_eq};

const M107_MODULUS: u128 = Mersenne107::MODULUS;
const M107_NUM_BITS: usize = Mersenne107::NUM_BITS;

///////////////////////////////////////////////////////////////////////////////////////////////////
// Modulus reduction
///////////////////////////////////////////////////////////////////////////////////////////////////

/// Reduces an u128 value modulo 2^107-1. Input can be 128-bit wide.
#[inline]
pub(super) fn reduce_mod_inplace(val: &mut u128) {
    *val = (*val >> M107_NUM_BITS) + (*val & M107_MODULUS);
    reduce_mod_1bit_inplace(val);
    debug_assert!(*val < M107_MODULUS);
}

/// Reduces an u128 value modulo 2^107-1. There are no bounds on the input limbs.
#[inline(always)]
pub(super) fn reduce_mod(mut val: u128) -> u128 {
    reduce_mod_inplace(&mut val);
    val
}

/// Reduces an u128 value modulo 2^107-1 inplace. The input value must be less than 2 * (2^107-2) +
/// 2.
#[inline]
pub(super) fn reduce_mod_1bit_inplace(val: &mut u128) {
    debug_assert!(*val < 2 * M107_MODULUS + 2);
    *val = (*val + u128::from(*val >= M107_MODULUS)) & M107_MODULUS;
    debug_assert!(*val < M107_MODULUS);
}

#[inline(always)]
pub(super) fn reduce_mod_1bit(mut val: u128) -> u128 {
    reduce_mod_1bit_inplace(&mut val);
    val
}

///////////////////////////////////////////////////////////////////////////////////////////////////
// Multiplication
///////////////////////////////////////////////////////////////////////////////////////////////////

/// Computes `acc + (b * c)` as u128 and returns the result split in 2 u64.
#[inline(always)]
const fn mac(acc: u64, b: u64, c: u64) -> (u64, u64) {
    let ret = (acc as u128) + ((b as u128) * (c as u128));
    (ret as u64, (ret >> 64) as u64)
}

/// Multiplies two 2-limb values and output the 4-limb result.
///
/// The output limbs have 214 bits of data (i.e., 2x the Mersenne107 bit-length), the first 3
/// limbs have 64 bits and the last one 22 bits.
#[inline]
pub(super) fn mul_wide(a: u128, b: u128) -> [u64; 4] {
    #[inline(always)]
    const fn split_u64(a: u128) -> [u64; 2] {
        [a as u64, (a >> 64) as u64]
    }

    let a = split_u64(a);
    let b = split_u64(b);

    let (k0, k1) = mac(0, a[0], b[0]); // 64 and 64 bits
    let (k1, k2a) = mac(k1, a[0], b[1]); // 64 and 43 bits
    let (k1, k2b) = mac(k1, a[1], b[0]); // 64 and 43 bits
    let (k2, k3) = mac(k2a + k2b, a[1], b[1]); // 64 and 22 bits

    [k0, k1, k2, k3]
}

#[inline]
fn reduce_wide(a: &[u64; 4]) -> u128 {
    const K: usize = 64;
    const K1: usize = M107_NUM_BITS - K;
    const MASK1: u64 = (1u64 << K1) - 1;
    const K2: usize = 2 * K - M107_NUM_BITS;

    let r0 = a[0] as u128;
    let r1_0 = ((a[1] & MASK1) as u128) << K;
    let r1_1 = (a[1] >> K1) as u128;
    let r2 = (a[2] as u128) << K2;
    let r3 = (a[3] as u128) << (K2 + K);
    let r = r0 + r1_0 + r1_1 + r2 + r3; // 108 bits and < 2^108 - 3

    reduce_mod_1bit(r)
}

/// Multiplies 2 M107 values. Return a M107 reduced value.
#[inline]
pub(super) fn mul(a: u128, b: u128) -> u128 {
    let res_wide = mul_wide(a, b);
    reduce_wide(&res_wide)
}

///////////////////////////////////////////////////////////////////////////////////////////////////
// Accumulate with lazy modulus reduction
///////////////////////////////////////////////////////////////////////////////////////////////////

/// Intermediate representation of an M107 value with free space for accumulate operations.
///
/// If `v` is the array stored inside an `M107AccRepr`, then the encoded, unreduced value is `v[0] +
/// v[1].2^32 + v[2].2^64 + v[3].2^96`.
///
/// Up to 2^31 values can be cumulated.
#[derive(Clone, Copy, Debug)]
pub struct M107AccRepr([u64; 4]);

impl M107AccRepr {
    #[inline]
    pub fn zero() -> Self {
        Self([0; 4])
    }

    #[inline]
    pub fn new(val: u128) -> Self {
        let mut res = Self::zero();
        // Fill the first 4 limbs with 32-bit chunks of input value
        res.0[0] = val as u32 as u64;
        res.0[1] = (val >> 32) as u32 as u64;
        res.0[2] = (val >> 64) as u32 as u64;
        res.0[3] = (val >> 96) as u32 as u64;
        res
    }

    #[inline]
    pub fn add_assign(&mut self, rhs: &Self) {
        izip_eq!(&mut self.0, &rhs.0).for_each(|(acc, a)| {
            *acc += a;
        });
    }

    #[inline]
    pub fn reduce(self) -> u128 {
        const K1: usize = M107_NUM_BITS - 3 * 32;
        const MASK1: u64 = (1u64 << K1) - 1;

        let a = self.0;

        let r = (a[0] as u128)
            + ((a[1] as u128) << 32)
            + ((a[2] as u128) << 64)
            + (((a[3] & MASK1) as u128) << 96)
            + ((a[3] >> K1) as u128);

        reduce_mod(r)
    }
}

#[cfg(test)]
impl M107AccRepr {
    fn new_from_array(val: [u64; 4]) -> Self {
        Self(val)
    }

    pub fn to_bigint(self) -> num_bigint::BigInt {
        use num_bigint::BigInt;
        let a = self.0;

        BigInt::from(a[0])
            + (BigInt::from(a[1]) << 32)
            + (BigInt::from(a[2]) << 64)
            + (BigInt::from(a[3]) << 96)
    }
}

///////////////////////////////////////////////////////////////////////////////////////////////////
// Multiply-accumulate with lazy modulus reduction
///////////////////////////////////////////////////////////////////////////////////////////////////

/// Intermediate representation of an M107 value with free space for multiply-accumulate operations.
///
/// Limbs are stored interleaved. If `v` is the array stored inside an `M107MulAccRepr`, then the
/// encoded, unreduced value is `v[0] + v[4].2^32 + v[1].2^64 + v[5].2^96 + v[2].2^128 + v[6].2^160
/// + v[3].2^192`.
///
/// The last limb `v[7]` is not used.
///
/// Up to 2^31 values can be multiply-accumulated.
#[derive(Clone, Copy, Debug)]
pub struct M107MulAccRepr([u64; 8]);

impl M107MulAccRepr {
    #[inline]
    pub fn zero() -> Self {
        Self([0; 8])
    }

    #[inline]
    pub fn new(val: u128) -> Self {
        let mut res = Self::zero();
        // Fill 4 limbs (interleaved) with 32-bit chunks of input value
        res.0[0] = val as u32 as u64;
        res.0[4] = (val >> 32) as u32 as u64;
        res.0[1] = (val >> 64) as u32 as u64;
        res.0[5] = (val >> 96) as u32 as u64;
        res
    }

    #[inline]
    fn new_from_wide(val: &[u64; 4]) -> Self {
        #[inline(always)]
        const fn split_u32(a: u64) -> [u32; 2] {
            [a as u32, (a >> 32) as u32]
        }

        let mut res = Self::zero();
        val.iter().enumerate().for_each(|(i, e)| {
            let [a0, a1] = split_u32(*e);
            res.0[i] = a0 as u64;
            res.0[i + 4] = a1 as u64;
        });
        res
    }

    #[inline]
    pub fn add_assign(&mut self, rhs: &Self) {
        izip!(&mut self.0, &rhs.0).for_each(|(acc, a)| {
            *acc += a;
        });
    }

    /// Multiplies two 2-limb values and output the extra-wide result.
    ///
    /// The output limbs have 214 bits of data (i.e., 2x the Mersenne107 bit-length), the first 6
    /// limbs have 32 bits, and the one before the last has 22 bits. The last limb is not used.  The
    /// slack in each limb permits allows accumulating these values (up to 2^31 times) and
    /// performing a lazy-modulus reduction afterward.
    #[inline]
    pub fn mul(a: u128, b: u128) -> Self {
        Self::new_from_wide(&mul_wide(a, b))
    }

    /// Reduces extra-wide representation to a M107 value.
    #[inline]
    pub fn reduce(self) -> u128 {
        let a = self.0;

        const K0: usize = 3 * 32;
        const K1: usize = M107_NUM_BITS - 3 * 32;
        const K2: usize = 2 * M107_NUM_BITS - 6 * 32;
        const K3: usize = 6 * 32 - M107_NUM_BITS;
        const MASK1: u128 = (1u128 << K1) - 1;
        const MASK2: u64 = (1u64 << K2) - 1;

        let r0 = (a[0] as u128) + ((a[4] as u128) << 32) + ((a[1] as u128) << 64);
        let r1 = (a[5] as u128) + ((a[2] as u128) << 32) + ((a[6] as u128) << 64);
        let r2 = a[3];

        let r = r0
            + ((r1 & MASK1) << K0)
            + (r1 >> K1)
            + (((r2 & MASK2) as u128) << K3)
            + ((r2 >> K2) as u128);

        reduce_mod(r)
    }
}

#[cfg(test)]
impl M107MulAccRepr {
    fn new_from_array(val: &[u64; 8]) -> Self {
        let mut res = Self::zero();
        res.0[0] = val[0];
        res.0[4] = val[1];
        res.0[1] = val[2];
        res.0[5] = val[3];
        res.0[2] = val[4];
        res.0[6] = val[5];
        res.0[3] = val[6];
        res.0[7] = val[7];
        res
    }

    pub fn to_bigint(self) -> num_bigint::BigInt {
        use num_bigint::BigInt;
        let a = self.0;

        BigInt::from(a[0])
            + (BigInt::from(a[4]) << 32)
            + (BigInt::from(a[1]) << 64)
            + (BigInt::from(a[5]) << 96)
            + (BigInt::from(a[2]) << 128)
            + (BigInt::from(a[6]) << 160)
            + (BigInt::from(a[3]) << 192)
    }
}

#[cfg(test)]
mod test {
    use num_bigint::{BigInt, BigUint};
    use rand::Rng;
    use typenum::Unsigned;

    use crate::{
        algebra::field::mersenne::{
            m107_ops::{
                mul_wide,
                reduce_mod,
                reduce_mod_1bit,
                reduce_wide,
                M107AccRepr,
                M107MulAccRepr,
                M107_MODULUS,
            },
            test::bigint_to_m107,
        },
        random::test_rng,
    };

    #[allow(dead_code)] // it's used in the tests
    const M107_MAX: u128 = M107_MODULUS - 1;

    type M = typenum::U1000;

    #[test]
    fn test_ops_reduce_mod() {
        let mut rng = test_rng();
        for _ in 0..M::to_usize() {
            let a = rng.gen::<u128>();
            let a_red_true = a % M107_MODULUS;
            let a_red_actual = reduce_mod(a);
            assert_eq!(a_red_true, a_red_actual);
        }
    }

    #[test]
    fn test_ops_reduce_mod_1bit() {
        fn test_internal(a: u128) {
            let a_red_true = a % M107_MODULUS;
            let a_red_actual = reduce_mod_1bit(a);
            assert_eq!(a_red_true, a_red_actual, "a = {a}");
        }

        let mut rng = test_rng();
        const MAX_VAL_1: u128 = (1u128 << 108) - 2;
        for _ in 0..M::to_usize() {
            let a = rng.gen::<u128>() % MAX_VAL_1;
            test_internal(a);
        }

        // Corner cases
        test_internal(MAX_VAL_1 - 1);
        test_internal(0);
        test_internal(1);
        test_internal(M107_MODULUS);
        test_internal(M107_MAX);
    }

    #[test]
    fn test_ops_reduce_wide() {
        fn test_internal(a: [u64; 4]) {
            let a_red_true = bigint_to_m107(
                BigInt::from(a[0])
                    + (BigInt::from(a[1]) << 64)
                    + (BigInt::from(a[2]) << 128)
                    + (BigInt::from(a[3]) << 192),
            )
            .0;
            let a_red_actual = reduce_wide(&a);
            assert_eq!(a_red_true, a_red_actual, "a = {a:?}");
        }

        // Random values test
        let mut rng = test_rng();
        for _ in 0..M::to_usize() {
            let a = rng.gen::<[u128; 2]>();
            let t: BigUint = (BigUint::from(a[0]) + (BigUint::from(a[1]) << 128))
                % (BigUint::from(M107_MAX) * BigUint::from(M107_MAX));
            let a = t.to_u64_digits().try_into().unwrap();
            test_internal(a);
        }

        // Corner cases
        test_internal([0; 4]);
        test_internal([
            0x4u64,
            0xffffe00000000000u64,
            0xffffffffffffffffu64,
            0x3fffffu64,
        ]); // Mersenne107::MAX * Mersenne107::MAX
    }

    #[test]
    fn test_ops_mul_wide() {
        fn test_internal(a: u128, b: u128) {
            let prod_exp = bigint_to_m107(BigInt::from(a) * BigInt::from(b)).0;
            let prod_act = reduce_wide(&mul_wide(a, b));
            assert_eq!(prod_exp, prod_act, "a = {a}, b = {b}");
        }

        let mut rng = test_rng();
        for _ in 0..M::to_usize() {
            let a = rng.gen::<u128>() % M107_MODULUS;
            let b = rng.gen::<u128>() % M107_MODULUS;
            test_internal(a, b);
        }

        // Corner cases
        test_internal(0, 0);
        test_internal(0, M107_MAX);
        test_internal(1, M107_MAX);
        test_internal(M107_MAX, M107_MAX);
    }

    #[test]
    fn test_mul_acc_repr_reduce() {
        fn test_internal(a: M107MulAccRepr) {
            let a_red_true = bigint_to_m107(a.to_bigint()).0;
            let a_red_actual = a.reduce();
            assert_eq!(a_red_true, a_red_actual, "a = {a:?}");
        }

        // Random values test
        let mut rng = test_rng();
        for _ in 0..M::to_usize() {
            let mut a = rng.gen::<[u64; 8]>();
            // Cap input to 2^31 dot-product
            for e in a[0..6].iter_mut() {
                *e &= 0x7fffffffffffffff; // 32+31 bits
            }
            a[6] &= 0x1fffffffffffff; // 22+31 bits
            test_internal(M107MulAccRepr::new_from_array(&a));
        }

        // Corner cases
        test_internal(M107MulAccRepr::zero());
        test_internal(M107MulAccRepr::new_from_array(&[
            0x200000000u64,
            0x0,
            0x0,
            0x7ffff00000000000,
            0x7fffffff80000000,
            0x7fffffff80000000,
            0x1fffff80000000,
            0x0,
        ])); // maximum 8-limbs value dot-product of vectors of size 2^31
    }
    #[test]
    fn test_mul_acc_repr_mul() {
        fn test_internal(a: u128, b: u128) {
            let prod_exp = bigint_to_m107(BigInt::from(a) * BigInt::from(b)).0;
            let prod_act = M107MulAccRepr::mul(a, b).reduce();
            assert_eq!(prod_exp, prod_act, "a = {a}, b = {b}");
        }

        let mut rng = test_rng();
        for _ in 0..M::to_usize() {
            let a = rng.gen::<u128>() % M107_MODULUS;
            let b = rng.gen::<u128>() % M107_MODULUS;
            test_internal(a, b);
        }

        // Corner cases
        test_internal(0, 0);
        test_internal(0, M107_MAX);
        test_internal(1, M107_MAX);
        test_internal(M107_MAX, M107_MAX);
    }
    #[test]
    fn test_acc_repr_reduce() {
        fn test_internal(a: M107AccRepr) {
            let a_red_true = bigint_to_m107(a.to_bigint()).0;
            let a_red_actual = a.reduce();
            assert_eq!(a_red_true, a_red_actual, "a = {a:?}");
        }

        // Random values test
        let mut rng = test_rng();
        for _ in 0..M::to_usize() {
            let mut a = rng.gen::<[u64; 4]>();
            // Cap input to 2^31 dot-product
            for e in a[0..3].iter_mut() {
                *e &= 0x7fffffffffffffff; // 32+31 bits
            }
            a[3] &= 0x1fffffffffffff; // 22+31 bits
            test_internal(M107AccRepr::new_from_array(a));
        }

        // Corner cases
        test_internal(M107AccRepr::zero());
        test_internal(M107AccRepr::new_from_array([
            0x7fffffff00000000u64,
            0x7fffffff80000000,
            0x7fffffff80000000,
            0x3ff80000000,
        ])); // maximum 4-limbs value of 2^31-sized sum
    }
    #[test]
    fn test_acc_repr_add_assign() {
        fn test_internal(a: u128, b: u128) {
            let mut a = M107AccRepr::new(a);
            let b = M107AccRepr::new(b);
            let prod_exp = bigint_to_m107(a.to_bigint() + b.to_bigint()).0;
            a.add_assign(&b);
            let prod_act = a.reduce();
            assert_eq!(prod_exp, prod_act, "a = {a:?}, b = {b:?}");
        }

        let mut rng = test_rng();
        for _ in 0..M::to_usize() {
            let a = rng.gen::<u128>() % M107_MODULUS;
            let b = rng.gen::<u128>() % M107_MODULUS;
            test_internal(a, b);
        }

        // Corner cases
        test_internal(0, 0);
        test_internal(0, M107_MAX);
        test_internal(1, M107_MAX);
        test_internal(M107_MAX, M107_MAX);
    }
}