cnfy-uint 0.2.3

//! Core Knuth Algorithm D quotient loop with MG10 reciprocal-based digit estimation.
use super::KnuthD;

impl KnuthD {
    /// Core Knuth Algorithm D loop with MG10 reciprocal-based quotient
    /// estimation.
    ///
    /// Processes quotient digits from most-significant to least-significant.
    /// Each iteration either uses [`div_3x2`](Self::div_3x2) for fast
    /// reciprocal estimation or falls back to full n-limb
    /// [`submul`](Self::submul) when `(u2, u1) >= (d1, d0)`. The remainder
    /// is left in `u[0..n]` (still normalized).
    pub(super) fn knuth_loop(&self, u: &mut [u64], m: usize, q: &mut [u64]) {
        for j in (0..=m).rev() {
            let u2 = u[j + self.n];
            let u1 = u[j + self.n - 1];
            let u0 = u[j + self.n - 2];
            let u21 = ((u2 as u128) << 64) | (u1 as u128);

            let mut qj;

            if u21 >= self.d {
                // Overflow: (u2, u1) >= (d1, d0), quotient digit is u64::MAX.
                // Full n-limb multiply-subtract.
                qj = u64::MAX;
                let borrow = self.submul(&mut u[j..], qj);
                let top = u[j + self.n];
                u[j + self.n] = top.wrapping_sub(borrow);
                if borrow > top {
                    qj -= 1;
                    self.add_back(&mut u[j..]);
                }
            } else {
                // Normal: div_3x2 reciprocal estimation
                let (q_est, r) = self.div_3x2(u2, u1, u0);
                qj = q_est;

                // Subtract qj * vn[0..n-2] from u[j..j+n-2]
                let mut borrow: u64 = 0;
                for i in 0..self.n - 2 {
                    let prod = (qj as u128) * (self.vn[i] as u128) + (borrow as u128);
                    let (diff, b) = u[j + i].overflowing_sub(prod as u64);
                    u[j + i] = diff;
                    borrow = (prod >> 64) as u64 + b as u64;
                }

                // Subtract borrow from the 128-bit remainder
                let (r_adj, underflow) = r.overflowing_sub(borrow as u128);
                u[j + self.n - 2] = r_adj as u64;
                u[j + self.n - 1] = (r_adj >> 64) as u64;
                u[j + self.n] = 0;

                if underflow {
                    qj -= 1;
                    self.add_back(&mut u[j..]);
                }
            }

            q[j] = qj;
        }
    }
}

#[cfg(test)]
mod ai_tests {
    use crate::u256::U256;
    use crate::u512::U512;

    use super::*;

    /// Full pipeline: normalize, knuth_loop, unnormalize gives correct remainder.
    #[test]
    fn full_pipeline() {
        // 2^64 mod (2^64 + 1) = 2^64 - (2^64+1) * 0 = 2^64, but 2^64 < 2^64+1
        // so remainder is 2^64
        let divisor = U256::from_be_limbs([0, 0, 1, 1]);
        let dividend = U512::from_be_limbs([0, 0, 0, 0, 0, 0, 1, 0]);
        let n = divisor.sig_limbs();
        let v_le = divisor.to_le_limbs();
        let kd = KnuthD::new(&v_le, n);

        let u_le = dividend.to_le_limbs();
        let m = 8 - kd.n;
        let mut un = [0u64; 9];
        kd.normalize_dividend(&u_le, &mut un);

        let mut q_arr = [0u64; 8];
        kd.knuth_loop(&mut un[..m + kd.n + 1], m, &mut q_arr[..m + 1]);

        let rem_le = kd.unnormalize(&un);
        let result = U256(rem_le);
        assert_eq!(result, U256::from_be_limbs([0, 0, 1, 0]));
    }
}