cnfy-uint 0.2.3

//! Single Euclidean step using an approximate u64 quotient with correction.

use super::GcdState;
use crate::u320::U320;

impl GcdState {
    /// Performs one Euclidean step using an approximate u64 quotient,
    /// correcting it to the exact value via U320 comparison loops.
    ///
    /// Called when the Lehmer loop fails to batch (identity matrix), meaning
    /// the first quotient is too large for matrix accumulation but `r1`'s
    /// top-bit approximation is non-zero (bit-length gap < 64). The exact
    /// quotient is guaranteed to fit in u64 since `b != 0` implies the
    /// remainders differ by at most ~63 bits.
    ///
    /// The approximate quotient `q = a / b` (from 63-bit top-bit extraction)
    /// may over- or under-estimate the true quotient by a small amount due
    /// to truncation of lower bits. Two correction loops adjust `q`:
    ///
    /// 1. **Over-estimation**: while `q × r1 > r0`, decrease `q` (subtract
    ///    `r1` from the running product).
    /// 2. **Under-estimation**: while `remainder ≥ r1`, increase `q`
    ///    (subtract `r1` from the remainder).
    ///
    /// The coefficient update uses the cheap `U256 × u64 → U320` multiply
    /// (4 limb multiplications) instead of the full `U256 × U256` multiply
    /// (16 limb multiplications) used by [`full_step`](Self::full_step).
    #[inline]
    pub fn scalar_step(&mut self, mut q: u64) {
        let r0_wide = U320::from(self.r0);
        let r1_wide = U320::from(self.r1);
        let mut qr1 = self.r1 * q;

        // Over-estimated? Decrease q until q*r1 <= r0.
        while qr1 > r0_wide {
            qr1 = qr1.overflowing_sub(&r1_wide).0;
            q -= 1;
        }

        // rem = r0 - q*r1 (non-negative)
        let mut rem = r0_wide.overflowing_sub(&qr1).0;

        // Under-estimated? Increase q until rem < r1.
        while rem >= r1_wide {
            rem = rem.overflowing_sub(&r1_wide).0;
            q += 1;
        }

        self.r0 = self.r1;
        self.r1 = rem.low_u256();

        // Coefficient update: new_t = t0 - q * t1 (wrapping 256-bit)
        let qt1 = (self.t1 * q).low_u256();
        let new_t = self.t0.overflowing_sub(&qt1).0;
        self.t0 = self.t1;
        self.t1 = new_t;

        self.even = !self.even;
    }
}

#[cfg(test)]
mod ai_tests {
    use crate::u256::U256;

    use super::*;

    /// Single scalar step on small values: gcd(100, 37), q=2.
    #[test]
    fn small_values() {
        let mut state = GcdState::new(
            U256::from_be_limbs([0, 0, 0, 100]),
            U256::from_be_limbs([0, 0, 0, 37]),
        );
        state.scalar_step(2);
        assert_eq!(state.r0, U256::from_be_limbs([0, 0, 0, 37]));
        assert_eq!(state.r1, U256::from_be_limbs([0, 0, 0, 26]));
        assert!(!state.even);
    }

    /// Exact quotient: corrects from approximate q=3 down to 2.
    #[test]
    fn corrects_overestimate() {
        let mut state = GcdState::new(
            U256::from_be_limbs([0, 0, 0, 100]),
            U256::from_be_limbs([0, 0, 0, 37]),
        );
        // True quotient is 2, but we pass 3 (over-estimate by 1).
        state.scalar_step(3);
        assert_eq!(state.r0, U256::from_be_limbs([0, 0, 0, 37]));
        assert_eq!(state.r1, U256::from_be_limbs([0, 0, 0, 26]));
    }

    /// Exact quotient: corrects from approximate q=1 up to 2.
    #[test]
    fn corrects_underestimate() {
        let mut state = GcdState::new(
            U256::from_be_limbs([0, 0, 0, 100]),
            U256::from_be_limbs([0, 0, 0, 37]),
        );
        // True quotient is 2, but we pass 1 (under-estimate by 1).
        state.scalar_step(1);
        assert_eq!(state.r0, U256::from_be_limbs([0, 0, 0, 37]));
        assert_eq!(state.r1, U256::from_be_limbs([0, 0, 0, 26]));
    }

    /// Parity toggles on each scalar step.
    #[test]
    fn parity_toggles() {
        let mut state = GcdState::new(
            U256::from_be_limbs([0, 0, 0, 100]),
            U256::from_be_limbs([0, 0, 0, 37]),
        );
        assert!(state.even);
        state.scalar_step(2);
        assert!(!state.even);
    }

    /// Coefficient tracking: t0 and t1 advance correctly.
    #[test]
    fn coefficient_update() {
        let mut state = GcdState::new(
            U256::from_be_limbs([0, 0, 0, 100]),
            U256::from_be_limbs([0, 0, 0, 37]),
        );
        // Initially t0=0, t1=1. After step: q=2.
        // new_t = t0 - q*t1 = 0 - 2*1 = -2 (wrapping)
        state.scalar_step(2);
        assert_eq!(state.t0, U256::ONE);
        let neg2 = U256::ZERO
            .overflowing_sub(&U256::from_be_limbs([0, 0, 0, 2]))
            .0;
        assert_eq!(state.t1, neg2);
    }

    /// Large correction: approximate quotient off by ~32 still converges.
    #[test]
    fn large_correction() {
        let r0 = U256::from_be_limbs([0, 0, 0, 1 << 48]);
        let r1 = U256::from_be_limbs([0, 0, 0, 1 << 15]);
        let mut state = GcdState::new(r0, r1);
        // True quotient: (1<<48) / (1<<15) = 1<<33 = 8589934592
        // Pass an approximate quotient off by 30.
        let approx_q = (1u64 << 33) + 30;
        state.scalar_step(approx_q);
        assert_eq!(state.r0, U256::from_be_limbs([0, 0, 0, 1 << 15]));
        assert_eq!(state.r1, U256::ZERO);
    }

    /// Regression: the input that triggered the original mod_inv bug.
    #[test]
    fn regression_input() {
        let p = U256::from_be_limbs([
            0xFFFFFFFFFFFFFFFF,
            0xFFFFFFFFFFFFFFFF,
            0xFFFFFFFFFFFFFFFF,
            0xFFFFFFFEFFFFFC2F,
        ]);
        let dx = U256::from_be_limbs([
            16861361263094501749,
            6454130256144472189,
            7904638317253979306,
            10596543604346685278,
        ]);
        let state = GcdState::new(p, dx);
        let inv = state.run().inverse();
        assert!(inv.is_some());
        assert_eq!(dx.mul_mod(&inv.unwrap(), &p), U256::ONE);
    }
}