Skip to main content

decimal_scaled/wide_int/
mod.rs

1//! Generic wide-integer arithmetic.
2//!
3//! A self-contained fixed-width big-integer layer: signed `Int*` and
4//! unsigned `Uint*` two's-complement integers from 256 to 4096 bits,
5//! plus the `WideInt` trait that casts losslessly between any two
6//! widths (or a primitive `i128` / `i64` / `u128`). The module depends
7//! on nothing else in the crate and is structured so it can later be
8//! lifted into a standalone crate.
9//!
10//! # Structure
11//!
12//! - **Slice primitives** — the actual arithmetic, written once over
13//!   `&[u128]` limb slices (little-endian, `limbs[0]` least
14//!   significant). Operating on slices sidesteps the const-generic
15//!   return-type problem a widening multiply would otherwise hit. The
16//!   core routines are `const fn` so the integer types built on them
17//!   can expose `const` constructors and constants.
18//! - **`macros`** — the `decl_wide_int!` macro: emits a concrete
19//!   two's-complement signed integer type and its unsigned sibling for
20//!   a fixed limb count, delegating arithmetic to the slice primitives.
21//! - The concrete `Uint* / Int*` type pairs generated by that macro.
22
23// ─────────────────────────────────────────────────────────────────────
24// Slice primitives — unsigned limb-array arithmetic.
25//
26// Every routine treats its slices as little-endian unsigned integers.
27// Lengths are taken from the slices; callers size output buffers.
28// ─────────────────────────────────────────────────────────────────────
29
30/// Full 128×128 → 256 unsigned product, `(high, low)`.
31#[inline]
32const fn mul_128(a: u128, b: u128) -> (u128, u128) {
33    let (a_hi, a_lo) = (a >> 64, a & u64::MAX as u128);
34    let (b_hi, b_lo) = (b >> 64, b & u64::MAX as u128);
35    let (mid, carry1) = (a_lo * b_hi).overflowing_add(a_hi * b_lo);
36    let (low, carry2) = (a_lo * b_lo).overflowing_add(mid << 64);
37    let high = a_hi * b_hi + (mid >> 64) + ((carry1 as u128) << 64) + carry2 as u128;
38    (high, low)
39}
40
41/// `a == 0`.
42#[inline]
43pub(crate) const fn limbs_is_zero(a: &[u128]) -> bool {
44    let mut i = 0;
45    while i < a.len() {
46        if a[i] != 0 {
47            return false;
48        }
49        i += 1;
50    }
51    true
52}
53
54/// `a == b` for two limb slices of possibly different lengths.
55#[inline]
56pub(crate) const fn limbs_eq(a: &[u128], b: &[u128]) -> bool {
57    let n = if a.len() > b.len() { a.len() } else { b.len() };
58    let mut i = 0;
59    while i < n {
60        let av = if i < a.len() { a[i] } else { 0 };
61        let bv = if i < b.len() { b[i] } else { 0 };
62        if av != bv {
63            return false;
64        }
65        i += 1;
66    }
67    true
68}
69
70/// Three-way comparison of two limb slices of possibly different
71/// lengths. Returns `-1`, `0`, or `1` for less / equal / greater.
72#[inline]
73pub(crate) const fn limbs_cmp(a: &[u128], b: &[u128]) -> i32 {
74    let n = if a.len() > b.len() { a.len() } else { b.len() };
75    let mut i = n;
76    while i > 0 {
77        i -= 1;
78        let av = if i < a.len() { a[i] } else { 0 };
79        let bv = if i < b.len() { b[i] } else { 0 };
80        if av < bv {
81            return -1;
82        }
83        if av > bv {
84            return 1;
85        }
86    }
87    0
88}
89
90/// Bit length (`0` for zero, else `floor(log2)+1`).
91#[inline]
92pub(crate) const fn limbs_bit_len(a: &[u128]) -> u32 {
93    let mut i = a.len();
94    while i > 0 {
95        i -= 1;
96        if a[i] != 0 {
97            return (i as u32) * 128 + (128 - a[i].leading_zeros());
98        }
99    }
100    0
101}
102
103/// `a += b`, returning the carry out. `a.len() >= b.len()`.
104#[inline]
105pub(crate) const fn limbs_add_assign(a: &mut [u128], b: &[u128]) -> bool {
106    let mut carry = 0u128;
107    let mut i = 0;
108    while i < a.len() {
109        let bv = if i < b.len() { b[i] } else { 0 };
110        let (s1, c1) = a[i].overflowing_add(bv);
111        let (s2, c2) = s1.overflowing_add(carry);
112        a[i] = s2;
113        carry = (c1 as u128) + (c2 as u128);
114        i += 1;
115    }
116    carry != 0
117}
118
119/// `a -= b`, returning the borrow out. `a.len() >= b.len()`.
120#[inline]
121pub(crate) const fn limbs_sub_assign(a: &mut [u128], b: &[u128]) -> bool {
122    let mut borrow = 0u128;
123    let mut i = 0;
124    while i < a.len() {
125        let bv = if i < b.len() { b[i] } else { 0 };
126        let (d1, b1) = a[i].overflowing_sub(bv);
127        let (d2, b2) = d1.overflowing_sub(borrow);
128        a[i] = d2;
129        borrow = (b1 as u128) + (b2 as u128);
130        i += 1;
131    }
132    borrow != 0
133}
134
135/// `out = a << shift`. `out` is zeroed then filled; bits shifted past
136/// `out`'s width are dropped.
137pub(crate) const fn limbs_shl(a: &[u128], shift: u32, out: &mut [u128]) {
138    let mut z = 0;
139    while z < out.len() {
140        out[z] = 0;
141        z += 1;
142    }
143    let limb_shift = (shift / 128) as usize;
144    let bit = shift % 128;
145    let mut i = 0;
146    while i < a.len() {
147        let dst = i + limb_shift;
148        if dst < out.len() {
149            if bit == 0 {
150                out[dst] |= a[i];
151            } else {
152                out[dst] |= a[i] << bit;
153                if dst + 1 < out.len() {
154                    out[dst + 1] |= a[i] >> (128 - bit);
155                }
156            }
157        }
158        i += 1;
159    }
160}
161
162/// `out = a >> shift`. `out` is zeroed then filled.
163pub(crate) const fn limbs_shr(a: &[u128], shift: u32, out: &mut [u128]) {
164    let mut z = 0;
165    while z < out.len() {
166        out[z] = 0;
167        z += 1;
168    }
169    let limb_shift = (shift / 128) as usize;
170    let bit = shift % 128;
171    let mut i = limb_shift;
172    while i < a.len() {
173        let dst = i - limb_shift;
174        if dst < out.len() {
175            if bit == 0 {
176                out[dst] |= a[i];
177            } else {
178                out[dst] |= a[i] >> bit;
179                if dst >= 1 {
180                    out[dst - 1] |= a[i] << (128 - bit);
181                }
182            }
183        }
184        i += 1;
185    }
186}
187
188/// `out = a · b` (schoolbook). `out.len() >= a.len() + b.len()` and
189/// `out` must be zeroed by the caller.
190pub(crate) const fn limbs_mul(a: &[u128], b: &[u128], out: &mut [u128]) {
191    // Fast path for the 2-limb × 2-limb → 4-limb shape used by
192    // `Int256::wrapping_mul` (the densest wide-int call site). Hand
193    // unrolled so the compiler sees four independent `mul_128`
194    // sub-products that can issue in parallel; the inner loop variant
195    // can't express that.
196    if a.len() == 2 && b.len() == 2 && out.len() >= 4 {
197        let (a0, a1) = (a[0], a[1]);
198        let (b0, b1) = (b[0], b[1]);
199        // (a1·2^128 + a0)·(b1·2^128 + b0) = h3·2^384 + (h1+h2+l3)·2^256
200        // + (h0+l1+l2)·2^128 + l0
201        let (h0, l0) = mul_128(a0, b0);
202        let (h1, l1) = mul_128(a0, b1);
203        let (h2, l2) = mul_128(a1, b0);
204        let (h3, l3) = mul_128(a1, b1);
205
206        out[0] = l0;
207
208        let (s1, c1a) = h0.overflowing_add(l1);
209        let (s1, c1b) = s1.overflowing_add(l2);
210        out[1] = s1;
211        let mid_carry = (c1a as u128) + (c1b as u128);
212
213        let (s2, c2a) = h1.overflowing_add(h2);
214        let (s2, c2b) = s2.overflowing_add(l3);
215        let (s2, c2c) = s2.overflowing_add(mid_carry);
216        out[2] = s2;
217        let top_carry = (c2a as u128) + (c2b as u128) + (c2c as u128);
218
219        out[3] = h3.wrapping_add(top_carry);
220        return;
221    }
222
223    let mut i = 0;
224    while i < a.len() {
225        if a[i] != 0 {
226            let mut carry = 0u128;
227            let mut j = 0;
228            while j < b.len() {
229                // Skip zero-limb contributions: every `*` for `b[j] = 0`
230                // produces (0, 0) and the only effect is propagating the
231                // carry. Adds a hot fast path for widened-from-narrower
232                // operands (their upper limbs are zero) and for
233                // small-magnitude multipliers like `10^SCALE` at modest
234                // scales (one nonzero limb in `b`).
235                if b[j] != 0 || carry != 0 {
236                    let (hi, lo) = mul_128(a[i], b[j]);
237                    let idx = i + j;
238                    let (s1, c1) = out[idx].overflowing_add(lo);
239                    let (s2, c2) = s1.overflowing_add(carry);
240                    out[idx] = s2;
241                    carry = hi + (c1 as u128) + (c2 as u128);
242                }
243                j += 1;
244            }
245            let mut idx = i + b.len();
246            while carry != 0 && idx < out.len() {
247                let (s, c) = out[idx].overflowing_add(carry);
248                out[idx] = s;
249                carry = c as u128;
250                idx += 1;
251            }
252        }
253        i += 1;
254    }
255}
256
257/// Karatsuba multiplication threshold. For `a.len() < KARATSUBA_MIN`
258/// the schoolbook kernel wins because Karatsuba's recursive split
259/// allocates six `Vec`s of scratch per level (`z0`, `z1`, `z2`,
260/// `sum_a`, `sum_b`, plus the merge buffer). On `[u128]` limbs the
261/// `mul_128` savings (3·(n/2)² vs n² calls per level) only outpay
262/// that heap-alloc tax at n ≥ 32 limbs. Below that, the alloc + carry
263/// merges cost more than the limb-mul work they save — verified by
264/// `examples/karabench.rs`: at n=16 the schoolbook beats single-level
265/// Karatsuba ~800 ns vs ~880 ns; at n=32 Karatsuba wins back ~15%
266/// (~2.6 µs vs ~3.0 µs). Below 32 limbs the dispatcher therefore
267/// short-circuits straight to schoolbook.
268const KARATSUBA_MIN: usize = 32;
269
270/// `out = a · b` for equal-length inputs, dispatching to Karatsuba
271/// when the operand size warrants it.
272///
273/// Not `const fn` — Karatsuba's half-sum scratch needs heap allocation.
274/// Callers in `const` context (parsing string-literal constants, etc.)
275/// keep using [`limbs_mul`].
276///
277/// `out.len() >= 2 · a.len()`. `a.len() == b.len()` required for the
278/// Karatsuba path; mismatched lengths fall through to schoolbook.
279#[cfg(feature = "alloc")]
280pub(crate) fn limbs_mul_fast(a: &[u128], b: &[u128], out: &mut [u128]) {
281    if a.len() == b.len() && a.len() >= KARATSUBA_MIN {
282        limbs_mul_karatsuba(a, b, out);
283    } else {
284        limbs_mul(a, b, out);
285    }
286}
287
288#[cfg(not(feature = "alloc"))]
289pub(crate) fn limbs_mul_fast(a: &[u128], b: &[u128], out: &mut [u128]) {
290    limbs_mul(a, b, out);
291}
292
293/// Karatsuba multiplication, equal-length inputs.
294///
295/// Split `a = a_hi·B^h + a_lo`, `b = b_hi·B^h + b_lo` with
296/// `B = 2^128` and `h = a.len() / 2`. Compute three sub-products:
297///
298/// - `z0 = a_lo · b_lo`
299/// - `z2 = a_hi · b_hi`
300/// - `z1 = (a_lo + a_hi)·(b_lo + b_hi) − z0 − z2`
301///
302/// Then `a·b = z2·B^(2h) + z1·B^h + z0`.
303///
304/// Reference: Karatsuba, A. and Ofman, Yu. (1962). "Multiplication of
305/// Multidigit Numbers on Automata." *Doklady Akad. Nauk SSSR* 145,
306/// 293–294.
307#[cfg(feature = "alloc")]
308fn limbs_mul_karatsuba(a: &[u128], b: &[u128], out: &mut [u128]) {
309    debug_assert_eq!(a.len(), b.len());
310    debug_assert!(out.len() >= 2 * a.len());
311    let n = a.len();
312    if n < KARATSUBA_MIN {
313        // Zero out and run schoolbook.
314        for o in out.iter_mut().take(2 * n) {
315            *o = 0;
316        }
317        limbs_mul(a, b, out);
318        return;
319    }
320    let h = n / 2;
321    let (a_lo, a_hi_full) = a.split_at(h);
322    let (b_lo, b_hi_full) = b.split_at(h);
323    let a_hi = a_hi_full;
324    let b_hi = b_hi_full;
325
326    // z0 = a_lo · b_lo (length 2h)
327    let mut z0 = alloc::vec![0u128; 2 * h];
328    limbs_mul_karatsuba_padded(a_lo, b_lo, &mut z0);
329
330    // z2 = a_hi · b_hi (length 2*(n-h))
331    let hi_len = n - h;
332    let mut z2 = alloc::vec![0u128; 2 * hi_len];
333    limbs_mul_karatsuba_padded(a_hi, b_hi, &mut z2);
334
335    // sum_a = a_lo + a_hi (length max(h, hi_len) + 1)
336    let sum_len = core::cmp::max(h, hi_len) + 1;
337    let mut sum_a = alloc::vec![0u128; sum_len];
338    let mut sum_b = alloc::vec![0u128; sum_len];
339    sum_a[..h].copy_from_slice(a_lo);
340    sum_b[..h].copy_from_slice(b_lo);
341    limbs_add_assign(&mut sum_a[..], a_hi);
342    limbs_add_assign(&mut sum_b[..], b_hi);
343
344    // z1 = sum_a · sum_b (length 2 * sum_len)
345    let mut z1 = alloc::vec![0u128; 2 * sum_len];
346    limbs_mul_karatsuba_padded(&sum_a, &sum_b, &mut z1);
347
348    // z1 -= z0
349    limbs_sub_assign(&mut z1[..], &z0);
350    // z1 -= z2
351    limbs_sub_assign(&mut z1[..], &z2);
352
353    // Combine: out[..2h] = z0; out[2h..] = z2 shifted up by 2h;
354    // then add z1 shifted up by h.
355    for o in out.iter_mut().take(2 * n) {
356        *o = 0;
357    }
358    let z0_take = core::cmp::min(z0.len(), out.len());
359    out[..z0_take].copy_from_slice(&z0[..z0_take]);
360    let z2_take = core::cmp::min(z2.len(), out.len().saturating_sub(2 * h));
361    if z2_take > 0 {
362        out[2 * h..2 * h + z2_take].copy_from_slice(&z2[..z2_take]);
363    }
364    // Add z1 << h.
365    let z1_take = core::cmp::min(z1.len(), out.len().saturating_sub(h));
366    if z1_take > 0 {
367        limbs_add_assign(&mut out[h..h + z1_take], &z1[..z1_take]);
368    }
369}
370
371/// Karatsuba helper that pads to equal lengths if the caller passes
372/// uneven slices (happens at the recursion boundary when `n` is odd
373/// and `n_hi = n - h > h`).
374#[cfg(feature = "alloc")]
375fn limbs_mul_karatsuba_padded(a: &[u128], b: &[u128], out: &mut [u128]) {
376    if a.len() == b.len() && a.len() >= KARATSUBA_MIN {
377        limbs_mul_karatsuba(a, b, out);
378    } else {
379        for o in out.iter_mut() {
380            *o = 0;
381        }
382        limbs_mul(a, b, out);
383    }
384}
385
386/// Single-bit left shift in place; returns the bit shifted out of the
387/// top.
388#[inline]
389const fn limbs_shl1(a: &mut [u128]) -> u128 {
390    let mut carry = 0u128;
391    let mut i = 0;
392    while i < a.len() {
393        let new_carry = a[i] >> 127;
394        a[i] = (a[i] << 1) | carry;
395        carry = new_carry;
396        i += 1;
397    }
398    carry
399}
400
401/// `true` if every limb above index 0 is zero — the value fits a
402/// single 128-bit word.
403#[inline]
404const fn limbs_fit_one(a: &[u128]) -> bool {
405    let mut i = 1;
406    while i < a.len() {
407        if a[i] != 0 {
408            return false;
409        }
410        i += 1;
411    }
412    true
413}
414
415/// `quot = num / den`, `rem = num % den`. `quot.len() >= num.len()`,
416/// `rem.len() >= num.len()`; both are zeroed by this routine. `den`
417/// must be non-zero.
418///
419/// Two hardware fast paths short-circuit the binary long-division
420/// loop — they cover the dominant decimal cases (moderate magnitudes,
421/// divisor `10^scale` for `scale <= 19`):
422///
423/// - both operands fit a single 128-bit word → one hardware divide;
424/// - the divisor fits a 64-bit word → schoolbook base-2^64 division,
425///   one hardware divide per limb-half.
426///
427/// Otherwise it falls back to a binary shift-subtract loop bounded by
428/// the dividend's actual bit length.
429pub(crate) const fn limbs_divmod(
430    num: &[u128],
431    den: &[u128],
432    quot: &mut [u128],
433    rem: &mut [u128],
434) {
435    let mut z = 0;
436    while z < quot.len() {
437        quot[z] = 0;
438        z += 1;
439    }
440    z = 0;
441    while z < rem.len() {
442        rem[z] = 0;
443        z += 1;
444    }
445
446    let den_one_limb = limbs_fit_one(den);
447
448    // Fast path A: both dividend and divisor fit one 128-bit word.
449    if den_one_limb && limbs_fit_one(num) {
450        if !quot.is_empty() {
451            quot[0] = num[0] / den[0];
452        }
453        if !rem.is_empty() {
454            rem[0] = num[0] % den[0];
455        }
456        return;
457    }
458
459    // Fast path B: divisor fits a 64-bit word — schoolbook base-2^64
460    // long division, one hardware divide per 64-bit half of the
461    // dividend. Every `10^scale` for `scale <= 19` lands here.
462    if den_one_limb && den[0] <= u64::MAX as u128 {
463        let d = den[0];
464        let mut r: u128 = 0;
465        // Skip leading zero limbs of the numerator — every wide-tier
466        // call widens narrower operands into a `2 × $L`-limb buffer,
467        // so the top limbs are zero by construction. Each skipped
468        // limb saves two hardware divides.
469        let mut top = num.len();
470        while top > 0 && num[top - 1] == 0 {
471            top -= 1;
472        }
473        let mut i = top;
474        while i > 0 {
475            i -= 1;
476            let hi = num[i] >> 64;
477            let acc_hi = (r << 64) | hi;
478            let q_hi = acc_hi / d;
479            r = acc_hi % d;
480            let lo = num[i] & u64::MAX as u128;
481            let acc_lo = (r << 64) | lo;
482            let q_lo = acc_lo / d;
483            r = acc_lo % d;
484            if i < quot.len() {
485                quot[i] = (q_hi << 64) | q_lo;
486            }
487        }
488        if !rem.is_empty() {
489            rem[0] = r;
490        }
491        return;
492    }
493
494    // General path: binary shift-subtract, bounded by the dividend's
495    // actual bit length.
496    let bits = limbs_bit_len(num);
497    let mut i = bits;
498    while i > 0 {
499        i -= 1;
500        limbs_shl1(rem);
501        let bit = (num[(i / 128) as usize] >> (i % 128)) & 1;
502        rem[0] |= bit;
503        limbs_shl1(quot);
504        if limbs_cmp(rem, den) >= 0 {
505            limbs_sub_assign(rem, den);
506            quot[0] |= 1;
507        }
508    }
509}
510
511/// Capacity of the internal scratch buffers — 72 limbs (9216 bits),
512/// comfortably above the widest work integer in the crate (4096-bit →
513/// 32 limbs, with isqrt scratch ≤ 33).
514const SCRATCH_LIMBS: usize = 72;
515
516/// Runtime divide dispatcher. Picks the cheapest correct algorithm
517/// for the operand shape:
518///
519/// * **Single-limb divisor** — defer to the existing `const fn`
520///   [`limbs_divmod`] which carries hardware-divide fast paths for
521///   `1 / 1` and `n / u64` (every `10^scale` with `scale ≤ 19`).
522/// * **Multi-limb divisor below `BZ_THRESHOLD`** — [`limbs_divmod_knuth`].
523///   Algorithm D in base 2^128: `O(m·n)` multi-limb ops, vs the
524///   const path's `O((m+n)·n·128)` shift-subtract fallback. Net win
525///   on every wide-tier divide (D76 and above) with `SCALE > 19`.
526/// * **Very-wide divisor (`n ≥ BZ_THRESHOLD` and `top ≥ 2·n`)** —
527///   [`limbs_divmod_bz`]. Burnikel–Ziegler chunked schoolbook over
528///   Knuth. Wins at D307 deep scales where the divisor exceeds 8
529///   limbs.
530///
531/// Not `const fn` (the kernels aren't either). The wide-int
532/// `Div` / `Rem` operator impls take this path; the const-fn
533/// `wrapping_div` / `wrapping_rem` siblings stay on
534/// [`limbs_divmod`] for compile-time evaluation.
535pub(crate) fn limbs_divmod_dispatch(
536    num: &[u128],
537    den: &[u128],
538    quot: &mut [u128],
539    rem: &mut [u128],
540) {
541    const BZ_THRESHOLD: usize = 8;
542
543    let mut n = den.len();
544    while n > 0 && den[n - 1] == 0 {
545        n -= 1;
546    }
547    assert!(n > 0, "limbs_divmod_dispatch: divide by zero");
548
549    let mut top = num.len();
550    while top > 0 && num[top - 1] == 0 {
551        top -= 1;
552    }
553
554    // Fast path A: both operands fit one 128-bit word.
555    if n == 1 && top <= 1 {
556        limbs_divmod(num, den, quot, rem);
557        return;
558    }
559
560    // Fast path B (covered by const limbs_divmod): single-limb
561    // divisor that also fits a u64. Every `10^scale` with
562    // `scale ≤ 19` lands here. Larger single-limb divisors
563    // (10^20 ≤ den ≤ 10^38) fall through to Knuth — the const
564    // path's only remaining option for those is the O(bits)
565    // shift-subtract, which is the bottleneck we're closing.
566    if n == 1 && den[0] <= u64::MAX as u128 {
567        limbs_divmod(num, den, quot, rem);
568        return;
569    }
570
571    // Multi-limb arithmetic OR oversized single-limb divisor —
572    // Knuth handles both efficiently; BZ wraps Knuth for very
573    // wide divisors.
574    if n >= BZ_THRESHOLD && top >= 2 * n {
575        limbs_divmod_bz(num, den, quot, rem);
576    } else {
577        limbs_divmod_knuth(num, den, quot, rem);
578    }
579}
580
581/// Möller–Granlund 2-by-1 invariant divisor.
582///
583/// Precomputes the reciprocal `v = ⌊(B² − 1) / d⌋ − B` (where
584/// `B = 2¹²⁸`) so each subsequent `(u₁·B + u₀) / d` reduces to two
585/// `mul_128`s plus a constant fix-up — vs the 128-iteration
586/// shift-subtract of [`div_2_by_1`].
587///
588/// Reference: Möller, N. and Granlund, T. (2011). *Improved Division
589/// by Invariant Integers*, IEEE Trans. Computers 60(2), 165–175,
590/// Algorithm 4 (div) and Algorithm 6 (reciprocal). PDF:
591/// <https://gmplib.org/~tege/division-paper.pdf>.
592///
593/// Setup amortises one bit-recovery `div_2_by_1` across every
594/// quotient limb of a [`limbs_divmod_knuth`] call, which is the
595/// difference between the wide-tier divide being ~50× a 2-by-1 step
596/// per limb (today) and ~2 multiplies per limb (with this struct).
597#[derive(Clone, Copy)]
598pub(crate) struct MG2by1 {
599    /// Normalised divisor (top bit set).
600    d: u128,
601    /// Reciprocal `v = ⌊(B² − 1) / d⌋ − B`.
602    v: u128,
603}
604
605impl MG2by1 {
606    /// Setup. `d` must be normalised (`d >> 127 == 1`).
607    ///
608    /// Computes `v` as `⌊(B² − 1 − d·B) / d⌋`, using the algebraic
609    /// rewrite `(B² − 1)/d − B = (B² − 1 − d·B)/d`. The numerator
610    /// `B² − 1 − d·B = (B − d − 1)·B + (B − 1)`, a u256 with
611    /// `high = !d` (since `!d = (B − 1) − d`) and `low = u128::MAX`.
612    /// `high < d` for any normalised `d`, so the existing
613    /// bit-recovery [`div_2_by_1`] handles it without precondition
614    /// violation.
615    #[inline]
616    pub(crate) const fn new(d: u128) -> Self {
617        debug_assert!(d >> 127 == 1, "MG2by1::new: divisor must be normalised");
618        let (v, _r) = div_2_by_1(!d, u128::MAX, d);
619        Self { d, v }
620    }
621
622    /// Divide `(u1·B + u0)` by the stored divisor. `u1 < d` is
623    /// required (else the quotient wouldn't fit `u128`).
624    ///
625    /// Per MG Algorithm 4:
626    /// 1. `(q1, q0) = v·u1 + ⟨u1, u0⟩` (u257; high word may wrap u128)
627    /// 2. `q1 += 1`
628    /// 3. `r = u0 − q1·d  (mod B)`
629    /// 4. if `r > q0`: `q1 -= 1`; `r += d` (wraps mod B)
630    /// 5. if `r >= d`: `q1 += 1`; `r -= d`
631    ///
632    /// Wrap-around in step 1/2/4 is fine: step 3 only depends on
633    /// `q1 mod B` (since `q1·d mod B == (q1 mod B)·d mod B`), and the
634    /// `r > q0` / `r >= d` corrections recover the true quotient
635    /// modulo `B`. The final `q1` always fits `u128` because the true
636    /// quotient is `< B` (per the `u1 < d` precondition).
637    #[inline]
638    pub(crate) const fn div_rem(&self, u1: u128, u0: u128) -> (u128, u128) {
639        debug_assert!(u1 < self.d, "MG2by1::div_rem: high word must be < divisor");
640        // Step 1.
641        let (vu1_hi, vu1_lo) = mul_128(self.v, u1);
642        let (q0, c_lo) = vu1_lo.overflowing_add(u0);
643        let (q1, _c_hi_a) = vu1_hi.overflowing_add(u1);
644        let (q1, _c_hi_b) = q1.overflowing_add(c_lo as u128);
645        // Step 2.
646        let q1 = q1.wrapping_add(1);
647        // Step 3.
648        let r = u0.wrapping_sub(q1.wrapping_mul(self.d));
649        // Step 4.
650        let (q1, r) = if r > q0 {
651            (q1.wrapping_sub(1), r.wrapping_add(self.d))
652        } else {
653            (q1, r)
654        };
655        // Step 5.
656        if r >= self.d {
657            (q1.wrapping_add(1), r.wrapping_sub(self.d))
658        } else {
659            (q1, r)
660        }
661    }
662}
663
664/// 2-by-1 unsigned divide: `(high · 2^128 + low) / d` and the matching
665/// remainder. Requires `high < d` so the quotient fits a single
666/// `u128`.
667///
668/// Implementation: bit-by-bit recovery (128 iterations, constant work
669/// per iter). Kept as the const-context fallback and as the setup
670/// path for [`MG2by1::new`]. Runtime callers that need many divides
671/// against the same divisor should use [`MG2by1`] instead — it cuts
672/// each subsequent divide from 128 iterations down to ~2 multiplies.
673#[inline]
674const fn div_2_by_1(high: u128, low: u128, d: u128) -> (u128, u128) {
675    // The classical recovery loop: at each step shift `r` left by 1,
676    // pull the next bit of `low` in, then conditionally subtract `d`
677    // and set the matching quotient bit. The catch is that `r` can
678    // grow past `2^128 − 1` between the shift and the subtract; we
679    // track that as the `r_top` carry-out bit so the comparison stays
680    // correct.
681    let mut q: u128 = 0;
682    let mut r = high;
683    let mut i = 128;
684    while i > 0 {
685        i -= 1;
686        let r_top = r >> 127;
687        r = (r << 1) | ((low >> i) & 1);
688        q <<= 1;
689        // r real = r_top·2^128 + r. Subtract if r_real ≥ d, i.e.
690        // r_top == 1 OR r ≥ d.
691        if r_top != 0 || r >= d {
692            r = r.wrapping_sub(d);
693            q |= 1;
694        }
695    }
696    (q, r)
697}
698
699/// Knuth Algorithm D — base-2^128 multi-limb long division. The
700/// algorithm-of-record base case for `limbs_divmod_bz` below.
701///
702/// Computes `quot = num / den`, `rem = num % den`. Requires
703/// `den` non-zero. `quot` and `rem` are zeroed by this routine.
704///
705/// This is the textbook Knuth Algorithm D (TAOCP Vol. 2, §4.3.1)
706/// adapted to base `2^128`: normalise the divisor so its top bit
707/// is set, then for each quotient limb estimate `q̂` from the top
708/// two limbs of the running dividend divided by the top limb of
709/// the divisor, refine `q̂` once if necessary against the second-
710/// from-top divisor limb, multiply-and-subtract, and add-back-and-
711/// decrement on the rare miss.
712///
713/// Complexity is `O(m·n)` multi-limb ops on `m+n / n`-limb inputs,
714/// versus the binary shift-subtract path's `O((m+n)·n·128)`. For
715/// `n = 32` limbs (Int4096) the difference is ~14×. For `n ≤ 2`
716/// limbs there's no win and the caller should keep using the
717/// existing single-limb fast paths in `limbs_divmod`.
718///
719/// Not `const fn`: the inner loops use `[u128; SCRATCH_LIMBS]`
720/// scratch buffers and mutate them via overflowing arithmetic
721/// that the const evaluator doesn't yet permit. None of the
722/// crate's const-contexts depend on this routine.
723pub(crate) fn limbs_divmod_knuth(
724    num: &[u128],
725    den: &[u128],
726    quot: &mut [u128],
727    rem: &mut [u128],
728) {
729    for q in quot.iter_mut() {
730        *q = 0;
731    }
732    for r in rem.iter_mut() {
733        *r = 0;
734    }
735
736    // Effective lengths after stripping leading zeros.
737    let mut n = den.len();
738    while n > 0 && den[n - 1] == 0 {
739        n -= 1;
740    }
741    assert!(n > 0, "limbs_divmod_knuth: divide by zero");
742
743    let mut top = num.len();
744    while top > 0 && num[top - 1] == 0 {
745        top -= 1;
746    }
747    if top < n {
748        // quotient is zero, remainder is num.
749        let copy_n = num.len().min(rem.len());
750        let mut i = 0;
751        while i < copy_n {
752            rem[i] = num[i];
753            i += 1;
754        }
755        return;
756    }
757
758    // D1. Normalise: shift divisor (and dividend) left by `shift` bits
759    // so the divisor's top limb has its high bit set. Knuth's q̂
760    // refinement guarantee only holds in that regime.
761    let shift = den[n - 1].leading_zeros();
762
763    let mut u = [0u128; SCRATCH_LIMBS];
764    let mut v = [0u128; SCRATCH_LIMBS];
765    debug_assert!(top < SCRATCH_LIMBS && n <= SCRATCH_LIMBS);
766
767    if shift == 0 {
768        u[..top].copy_from_slice(&num[..top]);
769        u[top] = 0;
770        v[..n].copy_from_slice(&den[..n]);
771    } else {
772        let mut carry: u128 = 0;
773        for i in 0..top {
774            let val = num[i];
775            u[i] = (val << shift) | carry;
776            carry = val >> (128 - shift);
777        }
778        u[top] = carry;
779        carry = 0;
780        for i in 0..n {
781            let val = den[i];
782            v[i] = (val << shift) | carry;
783            carry = val >> (128 - shift);
784        }
785    }
786
787    let m_plus_n = if u[top] != 0 { top + 1 } else { top };
788    debug_assert!(m_plus_n >= n);
789    let m = m_plus_n - n;
790
791    // Precompute the Möller–Granlund 2-by-1 reciprocal of the top
792    // divisor limb. After D1 the top limb has its high bit set
793    // (`v[n-1] >> 127 == 1` by construction), so the MG normalisation
794    // precondition is satisfied. One amortised setup cost spread
795    // across `m + 1` quotient-limb estimations.
796    let mg_top = MG2by1::new(v[n - 1]);
797
798    // D2. For j from m down to 0.
799    let mut j_plus_one = m + 1;
800    while j_plus_one > 0 {
801        j_plus_one -= 1;
802        let j = j_plus_one;
803
804        // D3. Estimate q̂.
805        let u_top = u[j + n];
806        let u_next = u[j + n - 1];
807        let v_top = v[n - 1];
808
809        let (mut q_hat, mut r_hat) = if u_top >= v_top {
810            // q̂ would exceed 2^128 − 1. Cap at the max and let the
811            // refinement / add-back step correct any over-estimate.
812            // r̂ = u_top·2^128 + u_next − q̂·v_top, computed mod 2^128
813            // with q̂ = 2^128 − 1: r̂ = u_top·2^128 + u_next − (2^128
814            // − 1)·v_top = (u_top − v_top)·2^128 + u_next + v_top.
815            // We only need r̂ ≤ 2^128 − 1 for the refinement step; if
816            // (u_top − v_top) ≥ 1, r̂ overflows and we skip the
817            // refinement (the multiply-subtract handles it).
818            let q = u128::MAX;
819            let (r, of) = u_next.overflowing_add(v_top);
820            // If overflow OR (u_top − v_top) ≥ 1 we treat r̂ as
821            // "above 2^128"; signal by returning r_overflow == true.
822            if of || u_top > v_top {
823                (q, u128::MAX) // sentinel; refinement loop will see r̂ "large" and not subtract.
824            } else {
825                (q, r)
826            }
827        } else {
828            mg_top.div_rem(u_top, u_next)
829        };
830
831        // Refinement: while q̂·v[n−2] > r̂·2^128 + u[j+n−2], decrement.
832        if n >= 2 {
833            let v_below = v[n - 2];
834            loop {
835                let (hi, lo) = mul_128(q_hat, v_below);
836                let rhs_lo = u[j + n - 2];
837                let rhs_hi = r_hat;
838                // Compare (hi, lo) vs (rhs_hi, rhs_lo).
839                if hi < rhs_hi || (hi == rhs_hi && lo <= rhs_lo) {
840                    break;
841                }
842                q_hat = q_hat.wrapping_sub(1);
843                let (new_r, of) = r_hat.overflowing_add(v_top);
844                if of {
845                    break;
846                }
847                r_hat = new_r;
848            }
849        }
850
851        // D4. Multiply-and-subtract: u[j..=j+n] -= q̂ · v[0..n].
852        let mut mul_carry: u128 = 0;
853        let mut borrow: u128 = 0;
854        for i in 0..n {
855            let (hi, lo) = mul_128(q_hat, v[i]);
856            let (prod_lo, c1) = lo.overflowing_add(mul_carry);
857            let new_mul_carry = hi + u128::from(c1);
858            let (s1, b1) = u[j + i].overflowing_sub(prod_lo);
859            let (s2, b2) = s1.overflowing_sub(borrow);
860            u[j + i] = s2;
861            borrow = u128::from(b1) + u128::from(b2);
862            mul_carry = new_mul_carry;
863        }
864        let (s1, b1) = u[j + n].overflowing_sub(mul_carry);
865        let (s2, b2) = s1.overflowing_sub(borrow);
866        u[j + n] = s2;
867        let final_borrow = u128::from(b1) + u128::from(b2);
868
869        // D5/D6. If multiply-subtract went negative, decrement q̂ and
870        // add v back.
871        if final_borrow != 0 {
872            q_hat = q_hat.wrapping_sub(1);
873            let mut carry: u128 = 0;
874            for i in 0..n {
875                let (s1, c1) = u[j + i].overflowing_add(v[i]);
876                let (s2, c2) = s1.overflowing_add(carry);
877                u[j + i] = s2;
878                carry = u128::from(c1) + u128::from(c2);
879            }
880            // Final carry cancels with the earlier borrow.
881            u[j + n] = u[j + n].wrapping_add(carry);
882        }
883
884        if j < quot.len() {
885            quot[j] = q_hat;
886        }
887    }
888
889    // D8. Denormalise the remainder: u[0..n] >> shift → rem.
890    if shift == 0 {
891        let copy_n = n.min(rem.len());
892        rem[..copy_n].copy_from_slice(&u[..copy_n]);
893    } else {
894        for i in 0..n {
895            if i < rem.len() {
896                let lo = u[i] >> shift;
897                let hi_into_lo = if i + 1 < n {
898                    u[i + 1] << (128 - shift)
899                } else {
900                    0
901                };
902                rem[i] = lo | hi_into_lo;
903            }
904        }
905    }
906}
907
908/// Burnikel–Ziegler recursive divide (MPI-I-98-1-022, 1998).
909///
910/// Splits an unbalanced `m+n / n` divide into a chain of balanced
911/// `2n / n` sub-divides, each of which recursively halves. The base
912/// case is [`limbs_divmod_knuth`] for divisors below `BZ_THRESHOLD`.
913/// On Karatsuba-multiplied operands BZ runs in `O(n^{1.58} · log n)`
914/// time vs Knuth's `O(n²)`.
915///
916/// For the widths this crate actually uses (Int256 … Int4096, ≤ 32
917/// limbs) the recursion only saves a constant factor over Knuth and
918/// the canonical `limbs_divmod` path stays untouched. BZ is exposed
919/// here so a bench-driven follow-up can promote it once a clear win
920/// shows up on the wide-tier divides.
921///
922/// Threshold: recurses only when both `num.len() ≥ 2·BZ_THRESHOLD`
923/// and `den.len() ≥ BZ_THRESHOLD`. Below that the cost of splitting
924/// dominates and Knuth wins outright.
925pub(crate) fn limbs_divmod_bz(
926    num: &[u128],
927    den: &[u128],
928    quot: &mut [u128],
929    rem: &mut [u128],
930) {
931    const BZ_THRESHOLD: usize = 8;
932
933    let mut n = den.len();
934    while n > 0 && den[n - 1] == 0 {
935        n -= 1;
936    }
937    assert!(n > 0, "limbs_divmod_bz: divide by zero");
938
939    let mut top = num.len();
940    while top > 0 && num[top - 1] == 0 {
941        top -= 1;
942    }
943
944    if n < BZ_THRESHOLD || top < 2 * n {
945        // Base case — Knuth handles every shape efficiently.
946        limbs_divmod_knuth(num, den, quot, rem);
947        return;
948    }
949
950    // BZ recursion: split the dividend into chunks of size `n` from
951    // the top, process each chunk with a `2n / n` sub-divide, carry
952    // the remainder forward. Each `2n / n` sub-divide itself does
953    // two `(3n/2) / n` calls via the recursive structure that — at
954    // these widths — Knuth handles inside its own quotient loop, so
955    // for now BZ here is essentially the chunked schoolbook outer
956    // loop with Knuth as the kernel. The full §3 two-by-one /
957    // three-by-two recursion is recorded in ALGORITHMS.md as the
958    // next layer to add once a bench shows it winning.
959    for q in quot.iter_mut() {
960        *q = 0;
961    }
962    for r in rem.iter_mut() {
963        *r = 0;
964    }
965
966    // Number of `n`-limb chunks in the dividend, rounded up so the
967    // top chunk may be short.
968    let chunks = top.div_ceil(n);
969    let mut carry = [0u128; SCRATCH_LIMBS];
970    let mut buf = [0u128; SCRATCH_LIMBS];
971    let mut q_chunk = [0u128; SCRATCH_LIMBS];
972    let mut r_chunk = [0u128; SCRATCH_LIMBS];
973
974    let mut idx = chunks;
975    while idx > 0 {
976        idx -= 1;
977        let lo = idx * n;
978        let hi = ((idx + 1) * n).min(top);
979        // buf = carry · 2^(n·128) + num[lo..hi]. carry holds the
980        // running remainder from the previous step (≤ n limbs).
981        buf.fill(0);
982        let chunk_len = hi - lo;
983        buf[..chunk_len].copy_from_slice(&num[lo..lo + chunk_len]);
984        buf[chunk_len..chunk_len + n].copy_from_slice(&carry[..n]);
985        let buf_len = chunk_len + n;
986        // Divide.
987        limbs_divmod_knuth(
988            &buf[..buf_len],
989            &den[..n],
990            &mut q_chunk[..buf_len],
991            &mut r_chunk[..n],
992        );
993        // Store quotient chunk.
994        let store_end = (lo + n).min(quot.len());
995        let store_len = store_end.saturating_sub(lo);
996        quot[lo..lo + store_len].copy_from_slice(&q_chunk[..store_len]);
997        // Carry the remainder.
998        carry[..n].copy_from_slice(&r_chunk[..n]);
999    }
1000    let rem_n = n.min(rem.len());
1001    rem[..rem_n].copy_from_slice(&carry[..rem_n]);
1002}
1003
1004/// `out = floor(sqrt(n))` via Newton's method. `out` is zeroed then
1005/// filled.
1006pub(crate) fn limbs_isqrt(n: &[u128], out: &mut [u128]) {
1007    for o in out.iter_mut() {
1008        *o = 0;
1009    }
1010    let bits = limbs_bit_len(n);
1011    if bits == 0 {
1012        return;
1013    }
1014    if bits <= 1 {
1015        out[0] = 1;
1016        return;
1017    }
1018    let work = n.len() + 1;
1019    debug_assert!(work <= SCRATCH_LIMBS, "wide-int isqrt scratch overflow");
1020    let mut x = [0u128; SCRATCH_LIMBS];
1021    let e = bits.div_ceil(2);
1022    x[(e / 128) as usize] |= 1u128 << (e % 128);
1023    loop {
1024        let mut q = [0u128; SCRATCH_LIMBS];
1025        let mut r = [0u128; SCRATCH_LIMBS];
1026        limbs_divmod(n, &x[..work], &mut q[..work], &mut r[..work]);
1027        limbs_add_assign(&mut q[..work], &x[..work]);
1028        let mut y = [0u128; SCRATCH_LIMBS];
1029        limbs_shr(&q[..work], 1, &mut y[..work]);
1030        if limbs_cmp(&y[..work], &x[..work]) >= 0 {
1031            break;
1032        }
1033        x = y;
1034    }
1035    let copy_len = if out.len() < work { out.len() } else { work };
1036    out[..copy_len].copy_from_slice(&x[..copy_len]);
1037}
1038
1039/// `limbs /= radix` in place, returning the remainder. `radix` must fit
1040/// a `u64` so the per-limb division stays within `u128`.
1041fn limbs_div_small(limbs: &mut [u128], radix: u128) -> u128 {
1042    let mut rem = 0u128;
1043    for limb in limbs.iter_mut().rev() {
1044        let hi = (*limb) >> 64;
1045        let lo = (*limb) & u128::from(u64::MAX);
1046        let acc_hi = (rem << 64) | hi;
1047        let q_hi = acc_hi / radix;
1048        let r1 = acc_hi % radix;
1049        let acc_lo = (r1 << 64) | lo;
1050        let q_lo = acc_lo / radix;
1051        rem = acc_lo % radix;
1052        *limb = (q_hi << 64) | q_lo;
1053    }
1054    rem
1055}
1056
1057/// Formats a limb slice into `buf` in the given radix (`2..=16`),
1058/// returning the written digit subslice. The slice is treated as an
1059/// unsigned magnitude.
1060pub(crate) fn limbs_fmt_into<'a>(
1061    limbs: &[u128],
1062    radix: u128,
1063    lower: bool,
1064    buf: &'a mut [u8],
1065) -> &'a str {
1066    let digits: &[u8] = if lower {
1067        b"0123456789abcdef"
1068    } else {
1069        b"0123456789ABCDEF"
1070    };
1071    if limbs_is_zero(limbs) {
1072        let last = buf.len() - 1;
1073        buf[last] = b'0';
1074        return core::str::from_utf8(&buf[last..]).unwrap();
1075    }
1076    let mut work = [0u128; SCRATCH_LIMBS];
1077    work[..limbs.len()].copy_from_slice(limbs);
1078    let wl = limbs.len();
1079    let mut pos = buf.len();
1080    while !limbs_is_zero(&work[..wl]) {
1081        let r = limbs_div_small(&mut work[..wl], radix);
1082        pos -= 1;
1083        buf[pos] = digits[r as usize];
1084    }
1085    core::str::from_utf8(&buf[pos..]).unwrap()
1086}
1087
1088// ─────────────────────────────────────────────────────────────────────
1089// u64 limb primitives.
1090//
1091// Drop-in shape replacements for the `[u128]`-based primitives above,
1092// but operating on `&[u64]` slices instead. The point: hardware has a
1093// native `u64 × u64 → u128` widening multiply and a native `u128 / u64`
1094// hardware divide, neither of which exists for `u128 × u128 → u256` or
1095// `u256 / u128`. The u128 primitives above are forced to soft-emulate
1096// both via the `mul_128` four-mul decomposition and the 128-iteration
1097// `div_2_by_1` bit-recovery loop. The u64 versions get the hardware
1098// instructions directly.
1099//
1100// During the in-progress storage migration these live alongside the
1101// u128 versions; the wrapper types (`$U` / `$S` in `decl_wide_int!`)
1102// continue to call the u128 entry points. A follow-up commit flips
1103// the storage to `[u64; 2 * $L]` and rewires every call site.
1104// ─────────────────────────────────────────────────────────────────────
1105
1106/// `a == 0`. u64-limb counterpart of [`limbs_is_zero`].
1107#[inline]
1108pub(crate) const fn limbs_is_zero_u64(a: &[u64]) -> bool {
1109    let mut i = 0;
1110    while i < a.len() {
1111        if a[i] != 0 {
1112            return false;
1113        }
1114        i += 1;
1115    }
1116    true
1117}
1118
1119/// Fixed-width specialisation of [`limbs_is_zero_u64`]. `L` const at
1120/// callsite, lets LLVM unroll for small `L`.
1121#[inline]
1122pub(crate) const fn limbs_is_zero_u64_fixed<const L: usize>(a: &[u64; L]) -> bool {
1123    let mut i = 0;
1124    while i < L {
1125        if a[i] != 0 {
1126            return false;
1127        }
1128        i += 1;
1129    }
1130    true
1131}
1132
1133/// `a == b` for two limb slices of possibly different lengths.
1134#[inline]
1135pub(crate) const fn limbs_eq_u64(a: &[u64], b: &[u64]) -> bool {
1136    let n = if a.len() > b.len() { a.len() } else { b.len() };
1137    let mut i = 0;
1138    while i < n {
1139        let av = if i < a.len() { a[i] } else { 0 };
1140        let bv = if i < b.len() { b[i] } else { 0 };
1141        if av != bv {
1142            return false;
1143        }
1144        i += 1;
1145    }
1146    true
1147}
1148
1149/// Three-way comparison `-1`/`0`/`1`.
1150#[inline]
1151pub(crate) const fn limbs_cmp_u64(a: &[u64], b: &[u64]) -> i32 {
1152    let n = if a.len() > b.len() { a.len() } else { b.len() };
1153    let mut i = n;
1154    while i > 0 {
1155        i -= 1;
1156        let av = if i < a.len() { a[i] } else { 0 };
1157        let bv = if i < b.len() { b[i] } else { 0 };
1158        if av < bv {
1159            return -1;
1160        }
1161        if av > bv {
1162            return 1;
1163        }
1164    }
1165    0
1166}
1167
1168/// Fixed-width specialisation of [`limbs_cmp_u64`] — both operands
1169/// the same `L`; no length-difference handling needed.
1170#[inline]
1171pub(crate) const fn limbs_cmp_u64_fixed<const L: usize>(a: &[u64; L], b: &[u64; L]) -> i32 {
1172    let mut i = L;
1173    while i > 0 {
1174        i -= 1;
1175        if a[i] < b[i] {
1176            return -1;
1177        }
1178        if a[i] > b[i] {
1179            return 1;
1180        }
1181    }
1182    0
1183}
1184
1185/// Bit length (`0` for zero, else `floor(log2)+1`).
1186#[inline]
1187pub(crate) const fn limbs_bit_len_u64(a: &[u64]) -> u32 {
1188    let mut i = a.len();
1189    while i > 0 {
1190        i -= 1;
1191        if a[i] != 0 {
1192            return (i as u32) * 64 + (64 - a[i].leading_zeros());
1193        }
1194    }
1195    0
1196}
1197
1198/// Fixed-width specialisation of [`limbs_bit_len_u64`].
1199#[inline]
1200pub(crate) const fn limbs_bit_len_u64_fixed<const L: usize>(a: &[u64; L]) -> u32 {
1201    let mut i = L;
1202    while i > 0 {
1203        i -= 1;
1204        if a[i] != 0 {
1205            return (i as u32) * 64 + (64 - a[i].leading_zeros());
1206        }
1207    }
1208    0
1209}
1210
1211/// `a += b`, returns carry out. `a.len() >= b.len()`.
1212#[inline]
1213pub(crate) const fn limbs_add_assign_u64(a: &mut [u64], b: &[u64]) -> bool {
1214    let mut carry: u64 = 0;
1215    let mut i = 0;
1216    while i < a.len() {
1217        let bv = if i < b.len() { b[i] } else { 0 };
1218        let (s1, c1) = a[i].overflowing_add(bv);
1219        let (s2, c2) = s1.overflowing_add(carry);
1220        a[i] = s2;
1221        carry = (c1 as u64) + (c2 as u64);
1222        i += 1;
1223    }
1224    carry != 0
1225}
1226
1227/// Fixed-width specialisation of [`limbs_add_assign_u64`] — both
1228/// operands the same `L`.
1229#[inline]
1230pub(crate) const fn limbs_add_assign_u64_fixed<const L: usize>(
1231    a: &mut [u64; L],
1232    b: &[u64; L],
1233) -> bool {
1234    let mut carry: u64 = 0;
1235    let mut i = 0;
1236    while i < L {
1237        let (s1, c1) = a[i].overflowing_add(b[i]);
1238        let (s2, c2) = s1.overflowing_add(carry);
1239        a[i] = s2;
1240        carry = (c1 as u64) + (c2 as u64);
1241        i += 1;
1242    }
1243    carry != 0
1244}
1245
1246/// `a -= b`, returns borrow out. `a.len() >= b.len()`.
1247#[inline]
1248pub(crate) const fn limbs_sub_assign_u64(a: &mut [u64], b: &[u64]) -> bool {
1249    let mut borrow: u64 = 0;
1250    let mut i = 0;
1251    while i < a.len() {
1252        let bv = if i < b.len() { b[i] } else { 0 };
1253        let (d1, b1) = a[i].overflowing_sub(bv);
1254        let (d2, b2) = d1.overflowing_sub(borrow);
1255        a[i] = d2;
1256        borrow = (b1 as u64) + (b2 as u64);
1257        i += 1;
1258    }
1259    borrow != 0
1260}
1261
1262/// Fixed-width specialisation of [`limbs_sub_assign_u64`].
1263#[inline]
1264pub(crate) const fn limbs_sub_assign_u64_fixed<const L: usize>(
1265    a: &mut [u64; L],
1266    b: &[u64; L],
1267) -> bool {
1268    let mut borrow: u64 = 0;
1269    let mut i = 0;
1270    while i < L {
1271        let (d1, b1) = a[i].overflowing_sub(b[i]);
1272        let (d2, b2) = d1.overflowing_sub(borrow);
1273        a[i] = d2;
1274        borrow = (b1 as u64) + (b2 as u64);
1275        i += 1;
1276    }
1277    borrow != 0
1278}
1279
1280/// Fixed-width specialisation of [`limbs_shl_u64`]. `L` const, but
1281/// `shift` is still runtime — bounds checks vanish, the inner loop
1282/// trip count is known.
1283#[inline]
1284pub(crate) const fn limbs_shl_u64_fixed<const L: usize>(
1285    a: &[u64; L],
1286    shift: u32,
1287    out: &mut [u64; L],
1288) {
1289    let mut z = 0;
1290    while z < L {
1291        out[z] = 0;
1292        z += 1;
1293    }
1294    let limb_shift = (shift / 64) as usize;
1295    let bit = shift % 64;
1296    let mut i = 0;
1297    while i < L {
1298        let dst = i + limb_shift;
1299        if dst < L {
1300            if bit == 0 {
1301                out[dst] |= a[i];
1302            } else {
1303                out[dst] |= a[i] << bit;
1304                if dst + 1 < L {
1305                    out[dst + 1] |= a[i] >> (64 - bit);
1306                }
1307            }
1308        }
1309        i += 1;
1310    }
1311}
1312
1313/// Fixed-width specialisation of [`limbs_shr_u64`].
1314#[inline]
1315pub(crate) const fn limbs_shr_u64_fixed<const L: usize>(
1316    a: &[u64; L],
1317    shift: u32,
1318    out: &mut [u64; L],
1319) {
1320    let mut z = 0;
1321    while z < L {
1322        out[z] = 0;
1323        z += 1;
1324    }
1325    let limb_shift = (shift / 64) as usize;
1326    let bit = shift % 64;
1327    let mut i = limb_shift;
1328    while i < L {
1329        let dst = i - limb_shift;
1330        if dst < L {
1331            if bit == 0 {
1332                out[dst] |= a[i];
1333            } else {
1334                out[dst] |= a[i] >> bit;
1335                if dst >= 1 {
1336                    out[dst - 1] |= a[i] << (64 - bit);
1337                }
1338            }
1339        }
1340        i += 1;
1341    }
1342}
1343
1344/// `out = a << shift`. `out` is zeroed then filled.
1345pub(crate) const fn limbs_shl_u64(a: &[u64], shift: u32, out: &mut [u64]) {
1346    let mut z = 0;
1347    while z < out.len() {
1348        out[z] = 0;
1349        z += 1;
1350    }
1351    let limb_shift = (shift / 64) as usize;
1352    let bit = shift % 64;
1353    let mut i = 0;
1354    while i < a.len() {
1355        let dst = i + limb_shift;
1356        if dst < out.len() {
1357            if bit == 0 {
1358                out[dst] |= a[i];
1359            } else {
1360                out[dst] |= a[i] << bit;
1361                if dst + 1 < out.len() {
1362                    out[dst + 1] |= a[i] >> (64 - bit);
1363                }
1364            }
1365        }
1366        i += 1;
1367    }
1368}
1369
1370/// `out = a >> shift`. `out` is zeroed then filled.
1371pub(crate) const fn limbs_shr_u64(a: &[u64], shift: u32, out: &mut [u64]) {
1372    let mut z = 0;
1373    while z < out.len() {
1374        out[z] = 0;
1375        z += 1;
1376    }
1377    let limb_shift = (shift / 64) as usize;
1378    let bit = shift % 64;
1379    let mut i = limb_shift;
1380    while i < a.len() {
1381        let dst = i - limb_shift;
1382        if dst < out.len() {
1383            if bit == 0 {
1384                out[dst] |= a[i];
1385            } else {
1386                out[dst] |= a[i] >> bit;
1387                if dst >= 1 {
1388                    out[dst - 1] |= a[i] << (64 - bit);
1389                }
1390            }
1391        }
1392        i += 1;
1393    }
1394}
1395
1396/// Single-bit left shift in place; returns the bit shifted out.
1397#[inline]
1398const fn limbs_shl1_u64(a: &mut [u64]) -> u64 {
1399    let mut carry: u64 = 0;
1400    let mut i = 0;
1401    while i < a.len() {
1402        let new_carry = a[i] >> 63;
1403        a[i] = (a[i] << 1) | carry;
1404        carry = new_carry;
1405        i += 1;
1406    }
1407    carry
1408}
1409
1410/// `true` if every limb above index 0 is zero — fits a single u64.
1411#[inline]
1412const fn limbs_fit_one_u64(a: &[u64]) -> bool {
1413    let mut i = 1;
1414    while i < a.len() {
1415        if a[i] != 0 {
1416            return false;
1417        }
1418        i += 1;
1419    }
1420    true
1421}
1422
1423/// `out = a · b` schoolbook. `out.len() >= a.len() + b.len()` and
1424/// `out` must be zeroed by the caller.
1425///
1426/// Inner step uses the native `u64 × u64 → u128` widening mul
1427/// (`MUL` + `UMULH` on x86-64 / aarch64), avoiding the 4-way
1428/// `mul_128` decomposition every u128 schoolbook step pays.
1429pub(crate) const fn limbs_mul_u64(a: &[u64], b: &[u64], out: &mut [u64]) {
1430    let mut i = 0;
1431    while i < a.len() {
1432        if a[i] != 0 {
1433            let mut carry: u64 = 0;
1434            let mut j = 0;
1435            while j < b.len() {
1436                if b[j] != 0 || carry != 0 {
1437                    let prod = (a[i] as u128) * (b[j] as u128);
1438                    let prod_lo = prod as u64;
1439                    let prod_hi = (prod >> 64) as u64;
1440                    let idx = i + j;
1441                    let (s1, c1) = out[idx].overflowing_add(prod_lo);
1442                    let (s2, c2) = s1.overflowing_add(carry);
1443                    out[idx] = s2;
1444                    carry = prod_hi + (c1 as u64) + (c2 as u64);
1445                }
1446                j += 1;
1447            }
1448            let mut idx = i + b.len();
1449            while carry != 0 && idx < out.len() {
1450                let (s, c) = out[idx].overflowing_add(carry);
1451                out[idx] = s;
1452                carry = c as u64;
1453                idx += 1;
1454            }
1455        }
1456        i += 1;
1457    }
1458}
1459
1460/// Fixed-width specialisation of [`limbs_mul_u64`]: the operand
1461/// limb-count `L` and output limb-count `D = 2·L` are both
1462/// compile-time constants, so the slice indirection and loop-bound
1463/// checks vanish and LLVM can unroll the inner loop (and, for small
1464/// `L`, the outer one too).
1465///
1466/// Same algorithm and same output as [`limbs_mul_u64`]; faster only
1467/// when both operands have known-equal length (the common case for
1468/// wide-tier `widen_mul` where both operands are an `Int{N}` of the
1469/// tier's storage width).
1470#[inline]
1471pub(crate) const fn limbs_mul_u64_fixed<const L: usize, const D: usize>(
1472    a: &[u64; L],
1473    b: &[u64; L],
1474    out: &mut [u64; D],
1475) {
1476    debug_assert!(D >= 2 * L, "limbs_mul_u64_fixed: D must be ≥ 2·L");
1477    let mut i = 0;
1478    while i < L {
1479        let ai = a[i];
1480        if ai != 0 {
1481            let mut carry: u64 = 0;
1482            let mut j = 0;
1483            while j < L {
1484                let v = (ai as u128) * (b[j] as u128)
1485                    + (out[i + j] as u128)
1486                    + (carry as u128);
1487                out[i + j] = v as u64;
1488                carry = (v >> 64) as u64;
1489                j += 1;
1490            }
1491            // Final row carry, propagated until exhausted or end of
1492            // `out`. Worst-case unbounded chain when out[i + L ..]
1493            // is all-ones; ordinarily exits after 1 iteration.
1494            let mut idx = i + L;
1495            let mut c = carry;
1496            while c != 0 && idx < D {
1497                let v = (out[idx] as u128) + (c as u128);
1498                out[idx] = v as u64;
1499                c = (v >> 64) as u64;
1500                idx += 1;
1501            }
1502        }
1503        i += 1;
1504    }
1505}
1506
1507/// `out = a · n` where `n` is a single u64 multiplier, `a` is a
1508/// fixed-width `L`-limb input, and `out` is a fixed-width
1509/// `LP1 = L + 1` limb output. `out` must be zeroed by the caller.
1510///
1511/// Specialisation of the n-by-1-word multi-precision multiply
1512/// (Knuth, TAOCP Vol 2 §4.3.1, Algorithm M with `n = 1`):
1513/// every inner-loop step is a single `u64 × u64 → u128` widening
1514/// mul plus an accumulator-and-carry fold, so the whole operation
1515/// is `L` widening muls and `L` adds with no cross-row carry
1516/// chains. By contrast, [`limbs_mul_u64_fixed`] called with
1517/// `b = [n, 0, ..., 0]` still runs the `L²` outer-product loop
1518/// (most iterations are short-circuited on `b[j] == 0`, but the
1519/// monomorphisation still emits the dead branches and the row
1520/// carry-propagation tail).
1521///
1522/// `LP1` must equal `L + 1`; the caller passes both because Rust
1523/// stable cannot express `L + 1` in a const generic position.
1524#[inline(always)]
1525pub(crate) const fn limbs_mul_u64_into<const L: usize, const LP1: usize>(
1526    a: &[u64; L],
1527    n: u64,
1528    out: &mut [u64; LP1],
1529) {
1530    debug_assert!(LP1 == L + 1, "limbs_mul_u64_into: LP1 must equal L + 1");
1531    let mut carry: u64 = 0;
1532    let mut i = 0;
1533    while i < L {
1534        // p fits u128 with no overflow:
1535        //   (2^64 - 1)·(2^64 - 1) + (2^64 - 1) + (2^64 - 1)
1536        //   = 2^128 - 1
1537        let p = (a[i] as u128) * (n as u128)
1538            + (out[i] as u128)
1539            + (carry as u128);
1540        out[i] = p as u64;
1541        carry = (p >> 64) as u64;
1542        i += 1;
1543    }
1544    out[L] = carry;
1545}
1546
1547/// `quot = num / den`, `rem = num % den`, u64 limbs.
1548///
1549/// Hardware fast paths:
1550/// - both fit a single u64 → one native `u64 / u64`
1551/// - divisor fits a single u64 → native `u128 / u64` per dividend limb
1552/// - otherwise → bit shift-subtract (only reached when divisor is
1553///   multi-limb; the dispatcher routes those to Knuth instead)
1554pub(crate) const fn limbs_divmod_u64(
1555    num: &[u64],
1556    den: &[u64],
1557    quot: &mut [u64],
1558    rem: &mut [u64],
1559) {
1560    let mut z = 0;
1561    while z < quot.len() {
1562        quot[z] = 0;
1563        z += 1;
1564    }
1565    z = 0;
1566    while z < rem.len() {
1567        rem[z] = 0;
1568        z += 1;
1569    }
1570
1571    let den_one_limb = limbs_fit_one_u64(den);
1572
1573    // Fast path A: both fit a single u64 → hardware divide.
1574    if den_one_limb && limbs_fit_one_u64(num) {
1575        if !quot.is_empty() {
1576            quot[0] = num[0] / den[0];
1577        }
1578        if !rem.is_empty() {
1579            rem[0] = num[0] % den[0];
1580        }
1581        return;
1582    }
1583
1584    // Fast path B: divisor fits a single u64 — schoolbook base-2^64
1585    // long divide using the native u128/u64 hardware divide. Note this
1586    // is the SAME shape as the u128 path's Fast B (which manually
1587    // splits u128 into 64-bit halves), but here it's one hardware
1588    // divide per limb instead of two.
1589    if den_one_limb {
1590        let d = den[0];
1591        let mut r: u64 = 0;
1592        let mut top = num.len();
1593        while top > 0 && num[top - 1] == 0 {
1594            top -= 1;
1595        }
1596        let mut i = top;
1597        while i > 0 {
1598            i -= 1;
1599            let acc = ((r as u128) << 64) | (num[i] as u128);
1600            let q = (acc / (d as u128)) as u64;
1601            r = (acc % (d as u128)) as u64;
1602            if i < quot.len() {
1603                quot[i] = q;
1604            }
1605        }
1606        if !rem.is_empty() {
1607            rem[0] = r;
1608        }
1609        return;
1610    }
1611
1612    // General path: binary shift-subtract. Only reached for multi-limb
1613    // divisors when the dispatcher isn't routing to Knuth (i.e. in
1614    // const contexts where Knuth isn't available).
1615    let bits = limbs_bit_len_u64(num);
1616    let mut i = bits;
1617    while i > 0 {
1618        i -= 1;
1619        limbs_shl1_u64(rem);
1620        let bit = (num[(i / 64) as usize] >> (i % 64)) & 1;
1621        rem[0] |= bit;
1622        limbs_shl1_u64(quot);
1623        if limbs_cmp_u64(rem, den) >= 0 {
1624            limbs_sub_assign_u64(rem, den);
1625            quot[0] |= 1;
1626        }
1627    }
1628}
1629
1630/// Scratch capacity for the runtime u64-limb kernels — 144 u64 limbs
1631/// (9216 bits), matching the u128 path's 72-limb scratch.
1632// 288 u64 limbs = 18432 bits — covers the widest work integer in
1633// the crate (Int16384 used by D1232 cbrt, 256 u64 limbs) with isqrt
1634// scratch slack.
1635const SCRATCH_LIMBS_U64: usize = 288;
1636
1637/// Karatsuba threshold for the u64-base multiplier. Set above any
1638/// width the crate ships so the dispatcher never routes through
1639/// Karatsuba in practice. The implementation is retained for
1640/// possible future use (a `simd` feature, an extra-wide tier past
1641/// D1232, etc.) but the M2 gate at L=16-96 showed schoolbook wins
1642/// 1.07-1.92× at every shipped width — LLVM-unrolled schoolbook
1643/// + the heap allocations the recursive split needs put the
1644/// crossover well past our widest tier.
1645pub(crate) const KARATSUBA_THRESHOLD_U64: usize = 256;
1646
1647/// Recursive Karatsuba multiplication at u64 base.
1648///
1649/// Reference: Karatsuba & Ofman 1962, "Multiplication of Multidigit
1650/// Numbers on Automata" (Doklady Akad. Nauk SSSR 145, 293-294).
1651/// Splits both equal-length operands at half: `a = a₁·B + a₀`,
1652/// `b = b₁·B + b₀`, computes three half-width sub-products
1653/// `z₀ = a₀·b₀`, `z₂ = a₁·b₁`, `z₁ = (a₀+a₁)·(b₀+b₁) − z₀ − z₂`,
1654/// then recombines as `z₂·B² + z₁·B + z₀`.
1655///
1656/// Operands must be equal length. `out.len() >= 2 * a.len()` and
1657/// `out` must be zeroed by the caller.
1658#[cfg(feature = "alloc")]
1659pub(crate) fn limbs_mul_karatsuba_u64(a: &[u64], b: &[u64], out: &mut [u64]) {
1660    debug_assert_eq!(a.len(), b.len());
1661    debug_assert!(out.len() >= 2 * a.len());
1662    let n = a.len();
1663    if n < KARATSUBA_THRESHOLD_U64 {
1664        limbs_mul_u64(a, b, out);
1665        return;
1666    }
1667    let h = n / 2;
1668    let hi_len = n - h;
1669    let (a_lo, a_hi) = a.split_at(h);
1670    let (b_lo, b_hi) = b.split_at(h);
1671
1672    // z0 = a_lo · b_lo
1673    let mut z0 = alloc::vec![0u64; 2 * h];
1674    limbs_mul_karatsuba_padded_u64(a_lo, b_lo, &mut z0);
1675
1676    // z2 = a_hi · b_hi
1677    let mut z2 = alloc::vec![0u64; 2 * hi_len];
1678    limbs_mul_karatsuba_padded_u64(a_hi, b_hi, &mut z2);
1679
1680    // sum_a = a_lo + a_hi, sum_b = b_lo + b_hi. The sums fit in
1681    // max(h, hi_len) + 1 limbs (1 limb of carry).
1682    let sum_len = core::cmp::max(h, hi_len) + 1;
1683    let mut sum_a = alloc::vec![0u64; sum_len];
1684    let mut sum_b = alloc::vec![0u64; sum_len];
1685    sum_a[..h].copy_from_slice(a_lo);
1686    sum_b[..h].copy_from_slice(b_lo);
1687    let _ = limbs_add_assign_u64(&mut sum_a[..], a_hi);
1688    let _ = limbs_add_assign_u64(&mut sum_b[..], b_hi);
1689
1690    // z1m = sum_a · sum_b, then z1 = z1m - z0 - z2.
1691    let mut z1 = alloc::vec![0u64; 2 * sum_len];
1692    limbs_mul_karatsuba_padded_u64(&sum_a, &sum_b, &mut z1);
1693    let _ = limbs_sub_assign_u64(&mut z1[..], &z0);
1694    let _ = limbs_sub_assign_u64(&mut z1[..], &z2);
1695
1696    // Combine: out = z0 + z2·B² + z1·B  (B = 2^(h·64)).
1697    for o in out.iter_mut().take(2 * n) {
1698        *o = 0;
1699    }
1700    let z0_take = core::cmp::min(z0.len(), out.len());
1701    out[..z0_take].copy_from_slice(&z0[..z0_take]);
1702    let z2_take = core::cmp::min(z2.len(), out.len().saturating_sub(2 * h));
1703    if z2_take > 0 {
1704        out[2 * h..2 * h + z2_take].copy_from_slice(&z2[..z2_take]);
1705    }
1706    let z1_take = core::cmp::min(z1.len(), out.len().saturating_sub(h));
1707    if z1_take > 0 {
1708        let _ = limbs_add_assign_u64(&mut out[h..h + z1_take], &z1[..z1_take]);
1709    }
1710}
1711
1712/// Karatsuba helper that handles uneven operand lengths at the
1713/// recursion boundary (`n` odd → `n_hi = n − h > h`).
1714#[cfg(feature = "alloc")]
1715fn limbs_mul_karatsuba_padded_u64(a: &[u64], b: &[u64], out: &mut [u64]) {
1716    if a.len() == b.len() && a.len() >= KARATSUBA_THRESHOLD_U64 {
1717        limbs_mul_karatsuba_u64(a, b, out);
1718    } else {
1719        for o in out.iter_mut() {
1720            *o = 0;
1721        }
1722        limbs_mul_u64(a, b, out);
1723    }
1724}
1725
1726/// Equal-length u64 multiplier dispatcher. Picks Karatsuba above
1727/// the threshold; otherwise schoolbook. Both operands are assumed
1728/// to be the same length (the common `widen_mul` case).
1729pub(crate) fn limbs_mul_fast_u64(a: &[u64], b: &[u64], out: &mut [u64]) {
1730    #[cfg(feature = "alloc")]
1731    {
1732        if a.len() == b.len() && a.len() >= KARATSUBA_THRESHOLD_U64 {
1733            for o in out.iter_mut() {
1734                *o = 0;
1735            }
1736            limbs_mul_karatsuba_u64(a, b, out);
1737            return;
1738        }
1739    }
1740    limbs_mul_u64(a, b, out);
1741}
1742
1743/// Möller–Granlund 2-by-1 invariant divisor at u64 base.
1744///
1745/// Reference: same as [`MG2by1`] (Möller & Granlund 2011, Algorithm 4).
1746///
1747/// The u64 base implementation is materially simpler than its u128
1748/// sibling because the doubled type (u128) is *native* — every step
1749/// that the u128 path does via `mul_128` + carry-merge is just one
1750/// `u128` op here.
1751#[derive(Clone, Copy)]
1752pub(crate) struct MG2by1U64 {
1753    d: u64,
1754    v: u64,
1755}
1756
1757impl MG2by1U64 {
1758    /// `d` must be normalised: `d >> 63 == 1`.
1759    #[inline]
1760    pub(crate) const fn new(d: u64) -> Self {
1761        debug_assert!(d >> 63 == 1, "MG2by1U64::new: divisor must be normalised");
1762        // v = floor((B² - 1 - d·B) / d) where B = 2^64.
1763        // Numerator high = !d (= B-1-d), low = u64::MAX (= B-1).
1764        // High < d for normalised d, so native u128/u128 with the
1765        // divisor cast to u128 returns a quotient fitting u64.
1766        let num = ((!d as u128) << 64) | (u64::MAX as u128);
1767        let v = (num / (d as u128)) as u64;
1768        Self { d, v }
1769    }
1770
1771    /// Divide `(u1·B + u0)` by `d`. Requires `u1 < d`.
1772    #[inline]
1773    pub(crate) const fn div_rem(&self, u1: u64, u0: u64) -> (u64, u64) {
1774        debug_assert!(u1 < self.d, "MG2by1U64::div_rem: high word must be < divisor");
1775        // (q1, q0) = v·u1 + ⟨u1, u0⟩ as u128
1776        let q128 = (self.v as u128).wrapping_mul(u1 as u128)
1777            .wrapping_add(((u1 as u128) << 64) | (u0 as u128));
1778        let mut q1 = (q128 >> 64) as u64;
1779        let q0 = q128 as u64;
1780        q1 = q1.wrapping_add(1);
1781        let mut r = u0.wrapping_sub(q1.wrapping_mul(self.d));
1782        if r > q0 {
1783            q1 = q1.wrapping_sub(1);
1784            r = r.wrapping_add(self.d);
1785        }
1786        if r >= self.d {
1787            q1 = q1.wrapping_add(1);
1788            r = r.wrapping_sub(self.d);
1789        }
1790        (q1, r)
1791    }
1792}
1793
1794/// Möller–Granlund 3-by-2 invariant divisor at u64 base.
1795///
1796/// Divides `(n2·B² + n1·B + n0)` by `(d1·B + d0)` for a normalised
1797/// 2-limb divisor (`d1`'s top bit set) using *two* limbs of divisor
1798/// information, returning a quotient that is exactly correct in one
1799/// pass — no refinement loop is needed in the Knuth Algorithm D
1800/// caller. Compared to [`MG2by1U64`] + the historic Knuth refinement
1801/// loop, the 3-by-2 form trades:
1802///
1803/// - +1 hardware multiply per call (the d0·q step), against
1804/// - up to 2 refinement-loop iterations per quotient limb saved.
1805///
1806/// Net win at every Knuth call with `n ≥ 2` divisor limbs.
1807///
1808/// Reference: Möller & Granlund 2011, Algorithm 5 (the divide) and
1809/// Algorithm 6 (the reciprocal precompute). `MG2by1U64` is the 2-by-1
1810/// cousin used by `limbs_divmod_knuth_u64`'s q̂ estimator.
1811#[derive(Clone, Copy)]
1812pub(crate) struct MG3by2U64 {
1813    d1: u64,
1814    d0: u64,
1815    /// Reciprocal of the top divisor limb (same formula as MG2by1U64::v).
1816    dinv: u64,
1817}
1818
1819impl MG3by2U64 {
1820    /// Setup. `d1` must be normalised (`d1 >> 63 == 1`).
1821    ///
1822    /// Computes the *3-by-2* invariant reciprocal, which differs from
1823    /// the [`MG2by1U64`] 2-by-1 reciprocal by an extra refinement step
1824    /// that accounts for `d0`. Without that refinement the algorithm
1825    /// fails on inputs where the divisor's low limb is large enough
1826    /// to push the q estimate over by more than the corrections can
1827    /// recover (test case: `n=(B-2, B-1, B-1)`, `d=(B-1, B-1)`, where
1828    /// the naive 2-by-1 reciprocal hands back q=0 instead of B-1).
1829    ///
1830    /// Reference: Möller & Granlund 2011, Algorithm 6 (the
1831    /// reciprocal refinement that accounts for `d0`).
1832    #[inline]
1833    pub(crate) const fn new(d1: u64, d0: u64) -> Self {
1834        debug_assert!(d1 >> 63 == 1, "MG3by2U64::new: top divisor limb must be normalised");
1835        // Step 1: 2-by-1 reciprocal of d1 alone.
1836        let num = ((!d1 as u128) << 64) | (u64::MAX as u128);
1837        let mut v = (num / (d1 as u128)) as u64;
1838
1839        // Step 2: refine for d0. `p = d1·v + d0` (mod B). If the sum
1840        // overflows, v was over-estimated → decrement.
1841        let mut p = d1.wrapping_mul(v).wrapping_add(d0);
1842        if p < d0 {
1843            v = v.wrapping_sub(1);
1844            let mask = if p >= d1 { u64::MAX } else { 0 };
1845            p = p.wrapping_sub(d1);
1846            v = v.wrapping_add(mask);
1847            p = p.wrapping_sub(mask & d1);
1848        }
1849
1850        // Step 3: account for d0·v. `(t1, t0) = d0·v`; check if
1851        // `p + t1` overflows; one or two more decrements may be
1852        // required.
1853        let prod = (d0 as u128) * (v as u128);
1854        let t1 = (prod >> 64) as u64;
1855        let t0 = prod as u64;
1856        let (new_p, carry) = p.overflowing_add(t1);
1857        let _p_final = new_p;
1858        if carry {
1859            v = v.wrapping_sub(1);
1860            if new_p >= d1 && (new_p > d1 || t0 >= d0) {
1861                v = v.wrapping_sub(1);
1862            }
1863        }
1864
1865        Self { d1, d0, dinv: v }
1866    }
1867
1868    /// Divide `(n2·B² + n1·B + n0)` by `(d1·B + d0)`. Requires
1869    /// `(n2, n1) < (d1, d0)` so the quotient fits a single u64.
1870    /// Returns `(q, r1, r0)` where the remainder is `r1·B + r0`.
1871    ///
1872    /// Algorithm decomposition (Möller & Granlund 2011, Algorithm 5):
1873    /// 1. Initial `q` from a 2-by-1 divide of `(n2, n1)` by `d1` via
1874    ///    the precomputed reciprocal `dinv`.
1875    /// 2. Subtract `(q·d1, q·d0)` from `(n1, n0)`; this stages the
1876    ///    candidate remainder `(r1, r0)`.
1877    /// 3. Two add-back corrections that fire 0–1 times each: the
1878    ///    first catches `q` over by 1, the second catches the rare
1879    ///    `q` over by 2 (only possible if the initial 2-by-1 reciprocal
1880    ///    over-shot).
1881    #[inline]
1882    pub(crate) const fn div_rem(&self, n2: u64, n1: u64, n0: u64) -> (u64, u64, u64) {
1883        debug_assert!(
1884            n2 < self.d1 || (n2 == self.d1 && n1 < self.d0),
1885            "MG3by2U64::div_rem: numerator high pair must be < divisor"
1886        );
1887
1888        // Step 1: q estimate from (n2, n1) / d1 via dinv.
1889        // (q_hi, q_lo) = n2 * dinv + (n2, n1) — overflow into a 257th
1890        // bit is fine, the mask-based correction (step 4a) recovers
1891        // from it without needing to materialise the lost bit.
1892        let prod = (n2 as u128).wrapping_mul(self.dinv as u128)
1893            .wrapping_add(((n2 as u128) << 64) | (n1 as u128));
1894        let mut q = (prod >> 64) as u64;
1895        let q_lo = prod as u64;
1896
1897        // Step 2a: r1 = n1 - q·d1 (mod B).
1898        let mut r1 = n1.wrapping_sub(q.wrapping_mul(self.d1));
1899
1900        // Step 2b: (r1, r0) = (r1, n0) - (d1, d0).
1901        let r256 = (((r1 as u128) << 64) | (n0 as u128))
1902            .wrapping_sub(((self.d1 as u128) << 64) | (self.d0 as u128));
1903        r1 = (r256 >> 64) as u64;
1904        let mut r0 = r256 as u64;
1905
1906        // Step 2c: (r1, r0) -= d0·q (mod B²).
1907        let t = (self.d0 as u128).wrapping_mul(q as u128);
1908        let r256 = (((r1 as u128) << 64) | (r0 as u128)).wrapping_sub(t);
1909        r1 = (r256 >> 64) as u64;
1910        r0 = r256 as u64;
1911
1912        // Step 3: q += 1; provisional.
1913        q = q.wrapping_add(1);
1914
1915        // Step 4a: first conditional correction.
1916        // If r1 >= q_lo (in u64 numeric), the provisional q was over
1917        // by 1; decrement q and add (d1, d0) back to the remainder.
1918        // Branchless via mask.
1919        let mask = if r1 >= q_lo { u64::MAX } else { 0 };
1920        q = q.wrapping_add(mask); // adds u64::MAX = -1.
1921        let add = ((mask & self.d1) as u128) << 64 | ((mask & self.d0) as u128);
1922        let r256 = (((r1 as u128) << 64) | (r0 as u128)).wrapping_add(add);
1923        r1 = (r256 >> 64) as u64;
1924        r0 = r256 as u64;
1925
1926        // Step 4b: final correction (rare).
1927        // If (r1, r0) >= (d1, d0), q was *still* off by 1; bump q and
1928        // subtract the divisor once more.
1929        if r1 > self.d1 || (r1 == self.d1 && r0 >= self.d0) {
1930            q = q.wrapping_add(1);
1931            let r256 = (((r1 as u128) << 64) | (r0 as u128))
1932                .wrapping_sub(((self.d1 as u128) << 64) | (self.d0 as u128));
1933            r1 = (r256 >> 64) as u64;
1934            r0 = r256 as u64;
1935        }
1936
1937        (q, r1, r0)
1938    }
1939}
1940
1941/// Runtime divide dispatcher at u64 base.
1942pub(crate) fn limbs_divmod_dispatch_u64(
1943    num: &[u64],
1944    den: &[u64],
1945    quot: &mut [u64],
1946    rem: &mut [u64],
1947) {
1948    const BZ_THRESHOLD_U64: usize = 16; // doubled from u128 path's 8.
1949
1950    let mut n = den.len();
1951    while n > 0 && den[n - 1] == 0 {
1952        n -= 1;
1953    }
1954    assert!(n > 0, "limbs_divmod_dispatch_u64: divide by zero");
1955
1956    let mut top = num.len();
1957    while top > 0 && num[top - 1] == 0 {
1958        top -= 1;
1959    }
1960
1961    // Single-limb divisor: defer to const limbs_divmod_u64 (its Fast B
1962    // is one hardware u128/u64 per dividend limb — already optimal).
1963    if n == 1 {
1964        limbs_divmod_u64(num, den, quot, rem);
1965        return;
1966    }
1967
1968    if n >= BZ_THRESHOLD_U64 && top >= 2 * n {
1969        limbs_divmod_bz_u64(num, den, quot, rem);
1970    } else {
1971        limbs_divmod_knuth_u64(num, den, quot, rem);
1972    }
1973}
1974
1975/// Knuth Algorithm D at base 2^64.
1976///
1977/// Same structure as [`limbs_divmod_knuth`] but every limb is a u64
1978/// and the q̂ estimator uses [`MG2by1U64`]. The multiply-subtract pass
1979/// uses native `u64 × u64 → u128`, which collapses one carry-merge
1980/// layer compared to the u128 version's `mul_128`.
1981pub(crate) fn limbs_divmod_knuth_u64(
1982    num: &[u64],
1983    den: &[u64],
1984    quot: &mut [u64],
1985    rem: &mut [u64],
1986) {
1987    for q in quot.iter_mut() {
1988        *q = 0;
1989    }
1990    for r in rem.iter_mut() {
1991        *r = 0;
1992    }
1993
1994    let mut n = den.len();
1995    while n > 0 && den[n - 1] == 0 {
1996        n -= 1;
1997    }
1998    assert!(n > 0, "limbs_divmod_knuth_u64: divide by zero");
1999
2000    let mut top = num.len();
2001    while top > 0 && num[top - 1] == 0 {
2002        top -= 1;
2003    }
2004    if top < n {
2005        let copy_n = num.len().min(rem.len());
2006        let mut i = 0;
2007        while i < copy_n {
2008            rem[i] = num[i];
2009            i += 1;
2010        }
2011        return;
2012    }
2013
2014    let shift = den[n - 1].leading_zeros();
2015    let mut u = [0u64; SCRATCH_LIMBS_U64];
2016    let mut v = [0u64; SCRATCH_LIMBS_U64];
2017    debug_assert!(top < SCRATCH_LIMBS_U64 && n <= SCRATCH_LIMBS_U64);
2018
2019    if shift == 0 {
2020        u[..top].copy_from_slice(&num[..top]);
2021        u[top] = 0;
2022        v[..n].copy_from_slice(&den[..n]);
2023    } else {
2024        let mut carry: u64 = 0;
2025        for i in 0..top {
2026            let val = num[i];
2027            u[i] = (val << shift) | carry;
2028            carry = val >> (64 - shift);
2029        }
2030        u[top] = carry;
2031        carry = 0;
2032        for i in 0..n {
2033            let val = den[i];
2034            v[i] = (val << shift) | carry;
2035            carry = val >> (64 - shift);
2036        }
2037    }
2038
2039    let m_plus_n = if u[top] != 0 { top + 1 } else { top };
2040    debug_assert!(m_plus_n >= n);
2041    let m = m_plus_n - n;
2042
2043    // Knuth Algorithm D requires a multi-limb divisor. Single-limb
2044    // divisors have a much faster hardware divide path; route them
2045    // out here so the hot loop below can assume n >= 2 and lose the
2046    // per-iteration n=1 dispatch.
2047    if n == 1 {
2048        limbs_divmod_u64(num, den, quot, rem);
2049        return;
2050    }
2051
2052    // MG 2-by-1 q̂ estimator (Möller-Granlund 2011 Algorithm 4) +
2053    // inner refinement against v[n-2]. The 3-by-2 estimator was
2054    // re-benched post u64 migration: its per-q̂ setup cost
2055    // (extra multiplies vs the 2-by-1's one) outweighs the
2056    // refinement loop's near-zero iteration count on decimal
2057    // divisors, so 2-by-1 + while-loop still wins at the widest
2058    // tiers.
2059    let v_top = v[n - 1];
2060    let v_below = v[n - 2];
2061    let mg_top = MG2by1U64::new(v_top);
2062
2063    let mut j_plus_one = m + 1;
2064    while j_plus_one > 0 {
2065        j_plus_one -= 1;
2066        let j = j_plus_one;
2067
2068        let jn = j + n;
2069        let u_top = u[jn];
2070        let u_next = u[jn - 1];
2071
2072        // MG 2-by-1 q̂ + (r_hat). Cheap-check `u_top > v_top` first
2073        // — strict greater → q̂ = MAX, no MG call, no refinement
2074        // possible — then handle the `u_top == v_top` edge inline,
2075        // then the common u_top < v_top case via the 2-by-1
2076        // estimator.
2077        let (mut q_hat, mut r_hat) = if u_top > v_top {
2078            (u64::MAX, u64::MAX)
2079        } else if u_top == v_top {
2080            let (r, of) = u_next.overflowing_add(v_top);
2081            (u64::MAX, if of { u64::MAX } else { r })
2082        } else {
2083            mg_top.div_rem(u_top, u_next)
2084        };
2085
2086        // Refinement against v[n-2]. Tightens q̂ from off-by-2 to
2087        // off-by-1; the post-subtract `final_borrow` check below
2088        // catches the remaining off-by-1 case. Almost never fires
2089        // on decimal divisors but cheap when it doesn't.
2090        loop {
2091            let prod = (q_hat as u128) * (v_below as u128);
2092            let hi = (prod >> 64) as u64;
2093            let lo = prod as u64;
2094            let rhs_lo = u[jn - 2];
2095            let rhs_hi = r_hat;
2096            if hi < rhs_hi || (hi == rhs_hi && lo <= rhs_lo) {
2097                break;
2098            }
2099            q_hat = q_hat.wrapping_sub(1);
2100            let (new_r, of) = r_hat.overflowing_add(v_top);
2101            if of {
2102                break;
2103            }
2104            r_hat = new_r;
2105        }
2106
2107        // D4. u[j..=j+n] -= q̂ · v[0..n]
2108        let mut mul_carry: u64 = 0;
2109        let mut borrow: u64 = 0;
2110        for i in 0..n {
2111            let prod = (q_hat as u128) * (v[i] as u128);
2112            let prod_lo = prod as u64;
2113            let prod_hi = (prod >> 64) as u64;
2114            let (s_prod, c1) = prod_lo.overflowing_add(mul_carry);
2115            let new_mul_carry = prod_hi + (c1 as u64);
2116            let (s1, b1) = u[j + i].overflowing_sub(s_prod);
2117            let (s2, b2) = s1.overflowing_sub(borrow);
2118            u[j + i] = s2;
2119            borrow = (b1 as u64) + (b2 as u64);
2120            mul_carry = new_mul_carry;
2121        }
2122        let (s1, b1) = u[j + n].overflowing_sub(mul_carry);
2123        let (s2, b2) = s1.overflowing_sub(borrow);
2124        u[j + n] = s2;
2125        let final_borrow = (b1 as u64) + (b2 as u64);
2126
2127        if final_borrow != 0 {
2128            q_hat = q_hat.wrapping_sub(1);
2129            let mut carry: u64 = 0;
2130            for i in 0..n {
2131                let (s1, c1) = u[j + i].overflowing_add(v[i]);
2132                let (s2, c2) = s1.overflowing_add(carry);
2133                u[j + i] = s2;
2134                carry = (c1 as u64) + (c2 as u64);
2135            }
2136            u[j + n] = u[j + n].wrapping_add(carry);
2137        }
2138
2139        if j < quot.len() {
2140            quot[j] = q_hat;
2141        }
2142    }
2143
2144    if shift == 0 {
2145        let copy_n = n.min(rem.len());
2146        rem[..copy_n].copy_from_slice(&u[..copy_n]);
2147    } else {
2148        for i in 0..n {
2149            if i < rem.len() {
2150                let lo = u[i] >> shift;
2151                let hi_into_lo = if i + 1 < n {
2152                    u[i + 1] << (64 - shift)
2153                } else {
2154                    0
2155                };
2156                rem[i] = lo | hi_into_lo;
2157            }
2158        }
2159    }
2160}
2161
2162/// Burnikel–Ziegler outer chunking, u64 base.
2163pub(crate) fn limbs_divmod_bz_u64(
2164    num: &[u64],
2165    den: &[u64],
2166    quot: &mut [u64],
2167    rem: &mut [u64],
2168) {
2169    const BZ_THRESHOLD_U64: usize = 16;
2170
2171    let mut n = den.len();
2172    while n > 0 && den[n - 1] == 0 {
2173        n -= 1;
2174    }
2175    assert!(n > 0, "limbs_divmod_bz_u64: divide by zero");
2176
2177    let mut top = num.len();
2178    while top > 0 && num[top - 1] == 0 {
2179        top -= 1;
2180    }
2181
2182    if n < BZ_THRESHOLD_U64 || top < 2 * n {
2183        limbs_divmod_knuth_u64(num, den, quot, rem);
2184        return;
2185    }
2186
2187    for q in quot.iter_mut() {
2188        *q = 0;
2189    }
2190    for r in rem.iter_mut() {
2191        *r = 0;
2192    }
2193
2194    let chunks = top.div_ceil(n);
2195    let mut carry = [0u64; SCRATCH_LIMBS_U64];
2196    let mut buf = [0u64; SCRATCH_LIMBS_U64];
2197    let mut q_chunk = [0u64; SCRATCH_LIMBS_U64];
2198    let mut r_chunk = [0u64; SCRATCH_LIMBS_U64];
2199
2200    let mut idx = chunks;
2201    while idx > 0 {
2202        idx -= 1;
2203        let lo = idx * n;
2204        let hi = ((idx + 1) * n).min(top);
2205        buf.fill(0);
2206        let chunk_len = hi - lo;
2207        buf[..chunk_len].copy_from_slice(&num[lo..lo + chunk_len]);
2208        buf[chunk_len..chunk_len + n].copy_from_slice(&carry[..n]);
2209        let buf_len = chunk_len + n;
2210        limbs_divmod_knuth_u64(
2211            &buf[..buf_len],
2212            &den[..n],
2213            &mut q_chunk[..buf_len],
2214            &mut r_chunk[..n],
2215        );
2216        let store_end = (lo + n).min(quot.len());
2217        let store_len = store_end.saturating_sub(lo);
2218        quot[lo..lo + store_len].copy_from_slice(&q_chunk[..store_len]);
2219        carry[..n].copy_from_slice(&r_chunk[..n]);
2220    }
2221    let rem_n = n.min(rem.len());
2222    rem[..rem_n].copy_from_slice(&carry[..rem_n]);
2223}
2224
2225/// `out = floor(sqrt(n))`. Newton iteration on top of the runtime
2226/// divide dispatcher.
2227///
2228/// History: this routine previously called the *const* [`limbs_divmod_u64`]
2229/// per iteration, which routes multi-limb divisors through the
2230/// O(bits²) shift-subtract path. At Int4096 (n=64 u64 limbs) that
2231/// dominates wall time — Newton converges in ~log₂(b) ≈ 12 iterations,
2232/// each one a `~65k`-limb-op divmod. Switching to
2233/// [`limbs_divmod_dispatch_u64`] gets Knuth-base-2⁶⁴ per iteration
2234/// (~`~32²` = 1024 limb-ops), worth ~40× on D307 sqrt.
2235pub(crate) fn limbs_isqrt_u64(n: &[u64], out: &mut [u64]) {
2236    for o in out.iter_mut() {
2237        *o = 0;
2238    }
2239    let bits = limbs_bit_len_u64(n);
2240    if bits == 0 {
2241        return;
2242    }
2243    if bits <= 1 {
2244        out[0] = 1;
2245        return;
2246    }
2247    let work = n.len() + 1;
2248    debug_assert!(work <= SCRATCH_LIMBS_U64, "isqrt scratch overflow");
2249    let mut x = [0u64; SCRATCH_LIMBS_U64];
2250
2251    // Initial guess. The classical seed is a single bit at position
2252    // `ceil(bits/2)` — one bit of accuracy, costing one Newton step per
2253    // doubling of accuracy (≈ `log2(bits/2)` iterations at any width).
2254    //
2255    // The hardware-`f64::sqrt` seed below lifts that to ~53 correct
2256    // bits in one go: extract the top 64 bits of `n` (which fits the
2257    // f64 mantissa with 11 bits of headroom), take the hardware sqrt,
2258    // and shift the result back to the correct magnitude. For Int512
2259    // (D76 sqrt input) this drops the Newton iteration count from ~8
2260    // to ~3, with each saved iteration eliminating one full
2261    // `limbs_divmod_dispatch_u64` call (the dominant cost).
2262    //
2263    // Hasselgren's trick — see Crandall & Pomerance 2005, "Prime
2264    // Numbers: A Computational Perspective" §9.2.1 — credits the
2265    // f64-bootstrap idea to T. Hasselgren in the GMP mailing list
2266    // archives; the implementation here is a from-first-principles
2267    // limb-array variant.
2268    if bits >= 8 {
2269        // Extract top 64 bits of `n` as a u64, aligned so the leading
2270        // 1 sits at position 63 (or as close as `n` allows).
2271        let shift = bits - 64.min(bits);
2272        // shift == 0 happens iff bits < 64 → n fits one limb.
2273        // Otherwise top_u64 = (n >> shift) & ((1<<64)-1).
2274        let limb_idx = (shift / 64) as usize;
2275        let bit_off = shift % 64;
2276        let top_u64: u64 = if bit_off == 0 {
2277            n[limb_idx]
2278        } else {
2279            let lo = n[limb_idx] >> bit_off;
2280            let hi = if limb_idx + 1 < n.len() {
2281                n[limb_idx + 1].checked_shl(64 - bit_off).unwrap_or(0)
2282            } else {
2283                0
2284            };
2285            lo | hi
2286        };
2287        // Hardware sqrt on the top 64 bits. f64 mantissa carries 53
2288        // bits; the low 11 of top_u64 are lost — accepted, the Newton
2289        // loop refines the seed to full precision.
2290        let seed_f64 = (top_u64 as f64).sqrt();
2291        // True sqrt(n) = sqrt(top * 2^shift) = seed_f64 * 2^(shift / 2).
2292        // If shift is odd we add a √2 factor:
2293        // seed_f64 * √2 * 2^((shift - 1) / 2).
2294        let (seed_f64, half_shift) = if (shift & 1) == 1 {
2295            (seed_f64 * core::f64::consts::SQRT_2, (shift - 1) / 2)
2296        } else {
2297            (seed_f64, shift / 2)
2298        };
2299        // Convert to an integer over-estimate. f64's sqrt is
2300        // correctly rounded but the `as u128` cast truncates toward
2301        // zero; ceil and add a 1-ULP safety margin so the placed
2302        // seed is a strict over-estimate of the true sqrt. Newton's
2303        // iteration only converges monotonically down to
2304        // floor(sqrt(n)) from a strict over-estimate; under-shoot
2305        // would cause the iteration to oscillate and the
2306        // `y >= x` exit check to terminate on the wrong side.
2307        // `core::f64::ceil` is std-only but we depend on std elsewhere
2308        // (the `as u128` cast already brings the libm float→int
2309        // conversion in). Use a no_std-safe manual ceil:
2310        // `(x as u128) + (if x.fract() != 0.0 { 1 } else { 0 })`.
2311        let truncated = seed_f64 as u128;
2312        let frac_nonzero = (truncated as f64) != seed_f64;
2313        let seed_int: u128 = truncated
2314            .saturating_add(if frac_nonzero { 1 } else { 0 })
2315            .saturating_add(1);
2316        // Place seed_int at bit position `half_shift` in x. seed_int
2317        // is at most ~53 bits set (the f64 mantissa) + 2, so the
2318        // shifted value occupies at most 2 u64 limbs.
2319        let seed_limb_idx = (half_shift / 64) as usize;
2320        let seed_bit_off = half_shift % 64;
2321        let shifted: u128 = seed_int << seed_bit_off;
2322        let seed_lo = shifted as u64;
2323        let seed_hi = (shifted >> 64) as u64;
2324        if seed_limb_idx < work {
2325            x[seed_limb_idx] |= seed_lo;
2326        }
2327        if seed_limb_idx + 1 < work {
2328            x[seed_limb_idx + 1] |= seed_hi;
2329        }
2330        // Newton needs a non-zero divisor. Empty seed (would only
2331        // happen on a tiny input — `bits >= 8` rules most of those
2332        // out, but defend the invariant anyway).
2333        if limbs_is_zero_u64(&x[..work]) {
2334            x[0] = 1;
2335        }
2336    } else {
2337        // Tiny n: fall back to the classical 1-bit seed.
2338        let e = bits.div_ceil(2);
2339        x[(e / 64) as usize] |= 1u64 << (e % 64);
2340    }
2341
2342    loop {
2343        let mut q = [0u64; SCRATCH_LIMBS_U64];
2344        let mut r = [0u64; SCRATCH_LIMBS_U64];
2345        limbs_divmod_dispatch_u64(n, &x[..work], &mut q[..work], &mut r[..work]);
2346        limbs_add_assign_u64(&mut q[..work], &x[..work]);
2347        let mut y = [0u64; SCRATCH_LIMBS_U64];
2348        limbs_shr_u64(&q[..work], 1, &mut y[..work]);
2349        if limbs_cmp_u64(&y[..work], &x[..work]) >= 0 {
2350            break;
2351        }
2352        x = y;
2353    }
2354    let copy_len = if out.len() < work { out.len() } else { work };
2355    out[..copy_len].copy_from_slice(&x[..copy_len]);
2356}
2357
2358/// `limbs /= radix` in place, returning the remainder. `radix` must
2359/// be a u64 (so the per-limb divide stays inside `u128 / u64`).
2360fn limbs_div_small_u64(limbs: &mut [u64], radix: u64) -> u64 {
2361    let mut rem: u64 = 0;
2362    for limb in limbs.iter_mut().rev() {
2363        let acc = ((rem as u128) << 64) | (*limb as u128);
2364        *limb = (acc / (radix as u128)) as u64;
2365        rem = (acc % (radix as u128)) as u64;
2366    }
2367    rem
2368}
2369
2370/// Format a u64 limb slice into `buf` in the given radix (`2..=16`).
2371pub(crate) fn limbs_fmt_into_u64<'a>(
2372    limbs: &[u64],
2373    radix: u64,
2374    lower: bool,
2375    buf: &'a mut [u8],
2376) -> &'a str {
2377    let digits: &[u8] = if lower {
2378        b"0123456789abcdef"
2379    } else {
2380        b"0123456789ABCDEF"
2381    };
2382    if limbs_is_zero_u64(limbs) {
2383        let last = buf.len() - 1;
2384        buf[last] = b'0';
2385        return core::str::from_utf8(&buf[last..]).unwrap();
2386    }
2387    let mut work = [0u64; SCRATCH_LIMBS_U64];
2388    work[..limbs.len()].copy_from_slice(limbs);
2389    let wl = limbs.len();
2390    let mut pos = buf.len();
2391    while !limbs_is_zero_u64(&work[..wl]) {
2392        let r = limbs_div_small_u64(&mut work[..wl], radix);
2393        pos -= 1;
2394        buf[pos] = digits[r as usize];
2395    }
2396    core::str::from_utf8(&buf[pos..]).unwrap()
2397}
2398
2399/// Signed three-way compare for u64-limb magnitudes with signs.
2400#[inline]
2401pub(crate) const fn scmp_u64(a_neg: bool, a: &[u64], b_neg: bool, b: &[u64]) -> i32 {
2402    match (a_neg, b_neg) {
2403        (true, false) => -1,
2404        (false, true) => 1,
2405        _ => limbs_cmp_u64(a, b),
2406    }
2407}
2408
2409// ─────────────────────────────────────────────────────────────────────
2410// End of u64 primitives.
2411// ─────────────────────────────────────────────────────────────────────
2412
2413mod macros;
2414use macros::decl_wide_int;
2415
2416
2417/// Signed three-way comparison of two magnitude-limb slices given their
2418/// signs. Returns `-1` / `0` / `1`.
2419#[inline]
2420pub(crate) const fn scmp(a_neg: bool, a: &[u128], b_neg: bool, b: &[u128]) -> i32 {
2421    match (a_neg, b_neg) {
2422        (true, false) => -1,
2423        (false, true) => 1,
2424        _ => limbs_cmp(a, b),
2425    }
2426}
2427
2428/// A signed integer that can be decomposed into a magnitude + sign and
2429/// rebuilt from one — the basis of `wide_cast` (widen / narrow between
2430/// any two widths, or between a wide integer and a primitive
2431/// `i128` / `i64` / `u128`). u64-limb representation throughout, so
2432/// every wide-int's magnitude buffer is directly compatible without
2433/// boundary conversion.
2434pub(crate) trait WideInt: Copy {
2435    /// Magnitude limbs (little-endian u64, zero-padded to 288) and sign.
2436    /// 288 u64 limbs = 18432 bits, covers Int16384 + slack.
2437    fn to_mag_sign(self) -> ([u64; 288], bool);
2438    /// Rebuilds from a magnitude limb slice and a sign, truncating
2439    /// the magnitude to this type's width.
2440    fn from_mag_sign(mag: &[u64], negative: bool) -> Self;
2441
2442    /// Writes the magnitude into a caller-supplied u128 limb buffer
2443    /// (little-endian, `dst[0]` least significant) and returns the
2444    /// sign. Implementations zero-pad `dst` to its full length. Used
2445    /// by hot-path callers (`mg_divide::div_wide_pow10_with`,
2446    /// `wide_transcendental` rescales) to avoid the 2.3 kB
2447    /// stack-allocated buffer that the default
2448    /// `to_mag_sign` → repack chain produces.
2449    ///
2450    /// Default impl wraps `to_mag_sign`; concrete `Int*` types
2451    /// override with a direct limb copy that only touches their
2452    /// `L / 2` real limbs.
2453    #[inline]
2454    fn mag_into_u128(self, dst: &mut [u128]) -> bool {
2455        let (mag, neg) = self.to_mag_sign();
2456        let n_pairs = (288 / 2).min(dst.len());
2457        let mut i = 0;
2458        while i < n_pairs {
2459            dst[i] = (mag[2 * i] as u128) | ((mag[2 * i + 1] as u128) << 64);
2460            i += 1;
2461        }
2462        while i < dst.len() {
2463            dst[i] = 0;
2464            i += 1;
2465        }
2466        neg
2467    }
2468
2469    /// Rebuilds `Self` from a u128-limb magnitude and a sign. Default
2470    /// impl unpacks each u128 into a pair of u64 limbs and routes
2471    /// through `from_mag_sign`; concrete `Int*` types override with a
2472    /// direct copy bounded by their `L / 2` real limbs (skipping the
2473    /// 288-limb u64 staging buffer entirely).
2474    #[inline]
2475    fn from_mag_sign_u128(mag: &[u128], negative: bool) -> Self {
2476        let mut out = [0u64; 288];
2477        let n_pairs = mag.len().min(288 / 2);
2478        let mut i = 0;
2479        while i < n_pairs {
2480            out[2 * i] = mag[i] as u64;
2481            out[2 * i + 1] = (mag[i] >> 64) as u64;
2482            i += 1;
2483        }
2484        Self::from_mag_sign(&out, negative)
2485    }
2486}
2487
2488/// Implements `WideInt` for a signed primitive integer. The
2489/// magnitude is split into two u64 limbs (low, high) regardless of
2490/// the source primitive's width — for `i64` the high limb is always
2491/// zero; for `i128` the split carries the upper 64 bits.
2492macro_rules! impl_wideint_signed_prim {
2493    ($($t:ty),*) => {$(
2494        impl WideInt for $t {
2495            #[inline]
2496            fn to_mag_sign(self) -> ([u64; 288], bool) {
2497                let mut out = [0u64; 288];
2498                let mag = self.unsigned_abs() as u128;
2499                out[0] = mag as u64;
2500                out[1] = (mag >> 64) as u64;
2501                (out, self < 0)
2502            }
2503            #[inline]
2504            fn from_mag_sign(mag: &[u64], negative: bool) -> $t {
2505                let lo = mag.first().copied().unwrap_or(0) as u128;
2506                let hi = mag.get(1).copied().unwrap_or(0) as u128;
2507                let combined = lo | (hi << 64);
2508                let m = combined as $t;
2509                if negative { m.wrapping_neg() } else { m }
2510            }
2511        }
2512    )*};
2513}
2514impl_wideint_signed_prim!(i8, i16, i32, i64, i128);
2515
2516impl WideInt for u128 {
2517    #[inline]
2518    fn to_mag_sign(self) -> ([u64; 288], bool) {
2519        let mut out = [0u64; 288];
2520        out[0] = self as u64;
2521        out[1] = (self >> 64) as u64;
2522        (out, false)
2523    }
2524    #[inline]
2525    fn from_mag_sign(mag: &[u64], _negative: bool) -> u128 {
2526        let lo = mag.first().copied().unwrap_or(0) as u128;
2527        let hi = mag.get(1).copied().unwrap_or(0) as u128;
2528        lo | (hi << 64)
2529    }
2530}
2531
2532/// Widening / narrowing cast between any two `WideInt` types via a
2533/// shared magnitude + sign representation.
2534#[inline]
2535pub(crate) fn wide_cast<S: WideInt, T: WideInt>(src: S) -> T {
2536    let (mag, negative) = src.to_mag_sign();
2537    T::from_mag_sign(&mag, negative)
2538}
2539
2540/// Common interface across the wide signed integer family
2541/// (`Int192` … `Int16384`) — the operations the wide-tier algorithm
2542/// kernels (sqrt / cbrt / ln / exp / trig …) need to operate
2543/// generically over their storage / work types.
2544///
2545/// Implemented only for the wide signed integers; the primitive
2546/// integer kernels live in `wide_int` separately.
2547pub(crate) trait WideStorage:
2548    Copy
2549    + PartialEq
2550    + PartialOrd
2551    + WideInt
2552    + ::core::ops::Add<Output = Self>
2553    + ::core::ops::Sub<Output = Self>
2554    + ::core::ops::Mul<Output = Self>
2555    + ::core::ops::Div<Output = Self>
2556    + ::core::ops::Rem<Output = Self>
2557    + ::core::ops::Neg<Output = Self>
2558    + ::core::ops::Shl<u32, Output = Self>
2559    + ::core::ops::Shr<u32, Output = Self>
2560{
2561    /// Width of this storage type in bits (`L * 64`).
2562    const BITS: u32;
2563    /// Additive identity.
2564    const ZERO: Self;
2565    /// Multiplicative identity.
2566    const ONE: Self;
2567    /// Integer constant `10`, used by the decimal-scale `10^scale`
2568    /// rescaling that every kernel performs.
2569    const TEN: Self;
2570
2571    /// Integer power: `self^exp` via right-to-left binary exponentiation.
2572    fn pow(self, exp: u32) -> Self;
2573    /// Exact integer square root.
2574    fn isqrt(self) -> Self;
2575    /// Widening / narrowing cast to a sibling wide-storage type.
2576    fn resize_to<T: WideStorage>(self) -> T;
2577    /// Leading-zero count of the two's-complement representation.
2578    fn leading_zeros(self) -> u32;
2579}
2580
2581// The concrete wide integer type pairs. The 256/512/1024-bit widths
2582// back the wide decimal tiers; 2048/4096-bit widths are the
2583// strict-transcendental work integers.
2584// $L = u64 limb count; $D = doubled (= 2 · $L) for widening
2585// products. Each Int*'s name encodes the bit width = $L · 64.
2586decl_wide_int!(Uint192, Int192, 3, 6, 4);
2587decl_wide_int!(Uint256, Int256, 4, 8, 5);
2588decl_wide_int!(Uint384, Int384, 6, 12, 7);
2589decl_wide_int!(Uint512, Int512, 8, 16, 9);
2590decl_wide_int!(Uint768, Int768, 12, 24, 13);
2591decl_wide_int!(Uint1024, Int1024, 16, 32, 17);
2592decl_wide_int!(Uint1536, Int1536, 24, 48, 25);
2593decl_wide_int!(Uint2048, Int2048, 32, 64, 33);
2594decl_wide_int!(Uint3072, Int3072, 48, 96, 49);
2595decl_wide_int!(Uint4096, Int4096, 64, 128, 65);
2596decl_wide_int!(Uint6144, Int6144, 96, 192, 97);
2597decl_wide_int!(Uint8192, Int8192, 128, 256, 129);
2598decl_wide_int!(Uint12288, Int12288, 192, 384, 193);
2599decl_wide_int!(Uint16384, Int16384, 256, 512, 257);
2600
2601/// Implements `WideStorage` for one signed wide integer type, by
2602/// delegating each method to the inherent items the
2603/// `decl_wide_int!` macro already emits.
2604macro_rules! impl_wide_storage {
2605    ($($S:ty),* $(,)?) => {$(
2606        impl WideStorage for $S {
2607            const BITS: u32 = <$S>::BITS;
2608            const ZERO: Self = <$S>::ZERO;
2609            const ONE: Self = <$S>::ONE;
2610            const TEN: Self = <$S>::from_i128(10);
2611
2612            #[inline]
2613            fn pow(self, exp: u32) -> Self {
2614                <$S>::pow(self, exp)
2615            }
2616            #[inline]
2617            fn isqrt(self) -> Self {
2618                <$S>::isqrt(self)
2619            }
2620            #[inline]
2621            fn resize_to<T: WideStorage>(self) -> T {
2622                // The inherent `resize` is bounded by `WideInt`, and
2623                // `WideStorage: WideInt` so `T: WideInt` holds.
2624                <$S>::resize::<T>(self)
2625            }
2626            #[inline]
2627            fn leading_zeros(self) -> u32 {
2628                <$S>::leading_zeros(self)
2629            }
2630        }
2631    )*};
2632}
2633
2634impl_wide_storage!(
2635    Int192, Int256, Int384, Int512, Int768, Int1024, Int1536, Int2048,
2636    Int3072, Int4096, Int6144, Int8192, Int12288, Int16384,
2637);
2638
2639// Short aliases used by the decimal-tier macros (replacing the former
2640// `crate::wide` re-export shim). The signed alias is exposed at each
2641// width where it backs storage *or* serves as the next-width mul/div
2642// widening step; the unsigned alias only where `Display`'s magnitude
2643// path needs it. The feature gates mirror the call-site features.
2644// Tier aliases — each width gets an `I*` (signed) alias when it
2645// backs storage or serves as a mul/div widening step for some tier,
2646// and a matching `U*` (unsigned) when `Display`'s magnitude path
2647// needs it.
2648#[cfg(any(feature = "d57", feature = "wide"))]
2649pub(crate) use self::{Int192 as I192, Uint192 as U192};
2650#[cfg(any(feature = "d57", feature = "d76", feature = "wide"))]
2651pub(crate) use self::Int384 as I384;
2652#[cfg(any(feature = "d115", feature = "wide"))]
2653pub(crate) use self::Uint384 as U384;
2654#[cfg(any(feature = "d76", feature = "wide"))]
2655pub(crate) use self::{Int256 as I256, Uint256 as U256};
2656#[cfg(any(feature = "d76", feature = "d115", feature = "d153", feature = "wide"))]
2657pub(crate) use self::Int512 as I512;
2658#[cfg(any(feature = "d153", feature = "wide"))]
2659pub(crate) use self::Uint512 as U512;
2660#[cfg(any(feature = "d115", feature = "d153", feature = "d230", feature = "wide"))]
2661pub(crate) use self::Int768 as I768;
2662#[cfg(any(feature = "d230", feature = "wide"))]
2663pub(crate) use self::Uint768 as U768;
2664#[cfg(any(feature = "d153", feature = "d230", feature = "d307", feature = "wide", feature = "x-wide"))]
2665pub(crate) use self::Int1024 as I1024;
2666#[cfg(any(feature = "d230", feature = "d307", feature = "d462", feature = "wide", feature = "x-wide"))]
2667pub(crate) use self::Int1536 as I1536;
2668#[cfg(any(feature = "d462", feature = "x-wide"))]
2669pub(crate) use self::Uint1536 as U1536;
2670#[cfg(any(feature = "d307", feature = "d462", feature = "d616", feature = "wide", feature = "x-wide"))]
2671pub(crate) use self::{Int2048 as I2048, Uint1024 as U1024};
2672#[cfg(any(feature = "d616", feature = "x-wide"))]
2673pub(crate) use self::Uint2048 as U2048;
2674#[cfg(any(feature = "d462", feature = "d616", feature = "d924", feature = "x-wide", feature = "xx-wide"))]
2675pub(crate) use self::Int3072 as I3072;
2676#[cfg(any(feature = "d924", feature = "xx-wide"))]
2677pub(crate) use self::Uint3072 as U3072;
2678#[cfg(any(feature = "d616", feature = "d924", feature = "d1232", feature = "x-wide", feature = "xx-wide"))]
2679pub(crate) use self::Int4096 as I4096;
2680#[cfg(any(feature = "d1232", feature = "xx-wide"))]
2681pub(crate) use self::Uint4096 as U4096;
2682#[cfg(any(feature = "d924", feature = "d1232", feature = "xx-wide"))]
2683pub(crate) use self::Int6144 as I6144;
2684#[cfg(any(feature = "d1232", feature = "xx-wide"))]
2685pub(crate) use self::Int8192 as I8192;
2686#[cfg(any(feature = "d924", feature = "xx-wide"))]
2687#[allow(unused_imports)]
2688pub(crate) use self::Int12288 as I12288;
2689#[cfg(any(feature = "d1232", feature = "xx-wide"))]
2690#[allow(unused_imports)]
2691pub(crate) use self::Int16384 as I16384;
2692
2693#[cfg(test)]
2694mod karatsuba_tests {
2695    use super::*;
2696
2697    /// Karatsuba and schoolbook must agree bit-for-bit on every
2698    /// equal-length input that meets the threshold.
2699    #[test]
2700    fn karatsuba_matches_schoolbook_at_n16() {
2701        let a: [u128; 16] = core::array::from_fn(|i| (i as u128) * 0xdead_beef + 1);
2702        let b: [u128; 16] = core::array::from_fn(|i| 0xcafe_babe ^ ((i as u128) << 5));
2703        let mut s = [0u128; 32];
2704        let mut k = [0u128; 32];
2705        limbs_mul(&a, &b, &mut s);
2706        limbs_mul_karatsuba(&a, &b, &mut k);
2707        assert_eq!(s, k);
2708    }
2709
2710    #[test]
2711    fn karatsuba_matches_schoolbook_at_n32() {
2712        let a: [u128; 32] = core::array::from_fn(|i| (i as u128).wrapping_mul(0x1234_5678_9abc));
2713        let b: [u128; 32] = core::array::from_fn(|i| (i as u128 + 1).wrapping_mul(0xfedc_ba98));
2714        let mut s = [0u128; 64];
2715        let mut k = [0u128; 64];
2716        limbs_mul(&a, &b, &mut s);
2717        limbs_mul_karatsuba(&a, &b, &mut k);
2718        assert_eq!(s, k);
2719    }
2720
2721    #[test]
2722    fn karatsuba_handles_zero_inputs() {
2723        let a = [0u128; 16];
2724        let b: [u128; 16] = core::array::from_fn(|i| (i as u128) + 1);
2725        let mut k = [0u128; 32];
2726        limbs_mul_karatsuba(&a, &b, &mut k);
2727        for o in &k {
2728            assert_eq!(*o, 0);
2729        }
2730    }
2731}
2732
2733#[cfg(test)]
2734mod hint_tests {
2735    use super::*;
2736
2737    #[test]
2738    fn signed_add_sub_neg() {
2739        let a = Int256::from_i128(5);
2740        let b = Int256::from_i128(3);
2741        assert_eq!(a.wrapping_add(b), Int256::from_i128(8));
2742        assert_eq!(a.wrapping_sub(b), Int256::from_i128(2));
2743        assert_eq!(b.wrapping_sub(a), Int256::from_i128(-2));
2744        assert_eq!(a.negate(), Int256::from_i128(-5));
2745        assert_eq!(Int256::ZERO.negate(), Int256::ZERO);
2746    }
2747
2748    #[test]
2749    fn signed_mul_div_rem() {
2750        let six = Int512::from_i128(6);
2751        let two = Int512::from_i128(2);
2752        let three = Int512::from_i128(3);
2753        assert_eq!(six.wrapping_mul(three), Int512::from_i128(18));
2754        assert_eq!(six.wrapping_div(two), three);
2755        assert_eq!(Int512::from_i128(7).wrapping_rem(three), Int512::from_i128(1));
2756        assert_eq!(Int512::from_i128(-7).wrapping_rem(three), Int512::from_i128(-1));
2757        assert_eq!(six.negate().wrapping_mul(three), Int512::from_i128(-18));
2758    }
2759
2760    #[test]
2761    fn checked_overflow() {
2762        assert_eq!(Int256::MAX.checked_add(Int256::ONE), None);
2763        assert_eq!(Int256::MIN.checked_sub(Int256::ONE), None);
2764        assert_eq!(Int256::MIN.checked_neg(), None);
2765        assert_eq!(
2766            Int256::from_i128(2).checked_add(Int256::from_i128(3)),
2767            Some(Int256::from_i128(5))
2768        );
2769    }
2770
2771    #[test]
2772    fn from_str_and_pow() {
2773        let ten = Int1024::from_str_radix("10", 10).unwrap();
2774        assert_eq!(ten, Int1024::from_i128(10));
2775        assert_eq!(ten.pow(3), Int1024::from_i128(1000));
2776        let big = Int1024::from_str_radix("10", 10).unwrap().pow(40);
2777        let from_str = Int1024::from_str_radix(
2778            "10000000000000000000000000000000000000000",
2779            10,
2780        )
2781        .unwrap();
2782        assert_eq!(big, from_str);
2783        assert_eq!(Int256::from_str_radix("-42", 10).unwrap(), Int256::from_i128(-42));
2784    }
2785
2786    #[test]
2787    fn ordering_and_resize() {
2788        assert!(Int256::from_i128(-1) < Int256::ZERO);
2789        assert!(Int256::MIN < Int256::MAX);
2790        let v = Int256::from_i128(-123_456_789);
2791        let wide: Int1024 = v.resize();
2792        let back: Int256 = wide.resize();
2793        assert_eq!(back, v);
2794        assert_eq!(wide, Int1024::from_i128(-123_456_789));
2795    }
2796
2797    #[test]
2798    fn isqrt_and_f64() {
2799        assert_eq!(Int512::from_i128(144).isqrt(), Int512::from_i128(12));
2800        assert_eq!(Int256::from_i128(1_000_000).as_f64(), 1_000_000.0);
2801        assert_eq!(Int256::from_f64(-2_500.0), Int256::from_i128(-2500));
2802    }
2803
2804    /// `Uint256` (the unsigned macro emission) supports the same
2805    /// bit/sign-manipulation surface as the signed sibling. Methods
2806    /// here are reachable through the wide decimal types but not always
2807    /// exercised by name; verify the contracts directly.
2808    #[test]
2809    fn uint256_is_zero_and_bit_helpers() {
2810        let zero = Uint256::ZERO;
2811        let one = Uint256::from_str_radix("1", 10).unwrap();
2812        let two = Uint256::from_str_radix("2", 10).unwrap();
2813        assert!(zero.is_zero());
2814        assert!(!one.is_zero());
2815        assert!(one.is_power_of_two());
2816        assert!(two.is_power_of_two());
2817        let three = Uint256::from_str_radix("3", 10).unwrap();
2818        assert!(!three.is_power_of_two());
2819        // next_power_of_two(0) == 1
2820        assert_eq!(zero.next_power_of_two(), one);
2821        // next_power_of_two(1) == 1 (already power of two)
2822        assert_eq!(one.next_power_of_two(), one);
2823        // next_power_of_two(3) == 4
2824        let four = Uint256::from_str_radix("4", 10).unwrap();
2825        assert_eq!(three.next_power_of_two(), four);
2826        // count_ones / leading_zeros
2827        assert_eq!(zero.count_ones(), 0);
2828        assert_eq!(one.count_ones(), 1);
2829        assert_eq!(zero.leading_zeros(), Uint256::BITS);
2830        assert_eq!(one.leading_zeros(), Uint256::BITS - 1);
2831    }
2832
2833    #[test]
2834    fn uint256_parse_arithmetic_and_pow() {
2835        // from_str_radix only accepts radix 10.
2836        assert!(Uint256::from_str_radix("10", 2).is_err());
2837        // Non-digit byte rejected.
2838        assert!(Uint256::from_str_radix("1a", 10).is_err());
2839        // Arithmetic: 3 - 2 = 1, 6 / 2 = 3, 7 % 3 = 1, 3·3 = 9.
2840        let two = Uint256::from_str_radix("2", 10).unwrap();
2841        let three = Uint256::from_str_radix("3", 10).unwrap();
2842        let six = Uint256::from_str_radix("6", 10).unwrap();
2843        let seven = Uint256::from_str_radix("7", 10).unwrap();
2844        assert_eq!(three - two, Uint256::from_str_radix("1", 10).unwrap());
2845        assert_eq!(six / two, three);
2846        assert_eq!(seven % three, Uint256::from_str_radix("1", 10).unwrap());
2847        // BitAnd / BitOr / BitXor
2848        let five = Uint256::from_str_radix("5", 10).unwrap();  // 101
2849        let four = Uint256::from_str_radix("4", 10).unwrap();  // 100
2850        let one = Uint256::from_str_radix("1", 10).unwrap();   // 001
2851        assert_eq!(five & four, four);                       // 100
2852        assert_eq!(five | one, five);                        // 101
2853        assert_eq!(five ^ four, one);                        // 001
2854        // pow: 2^10 = 1024
2855        let p10 = two.pow(10);
2856        assert_eq!(p10, Uint256::from_str_radix("1024", 10).unwrap());
2857        // cast_signed round-trip
2858        let signed = three.cast_signed();
2859        assert_eq!(signed, Int256::from_i128(3));
2860    }
2861
2862    /// `Int256::bit` reports the two's-complement bit at any index;
2863    /// indices past the storage width return the sign bit.
2864    #[test]
2865    fn signed_bit_and_trailing_zeros() {
2866        let v = Int256::from_i128(0b1100);
2867        assert!(v.bit(2));
2868        assert!(v.bit(3));
2869        assert!(!v.bit(0));
2870        assert!(!v.bit(1));
2871        // Out-of-range bit returns the sign — non-negative for v.
2872        assert!(!v.bit(1000));
2873        // Negative input: sign bit returns true past the storage.
2874        let n = Int256::from_i128(-1);
2875        assert!(n.bit(1000));
2876        // trailing_zeros
2877        assert_eq!(Int256::from_i128(8).trailing_zeros(), 3);
2878        assert_eq!(Int256::ZERO.trailing_zeros(), Int256::BITS);
2879    }
2880}
2881
2882#[cfg(test)]
2883mod slice_tests {
2884    use super::*;
2885
2886    #[test]
2887    fn mul_and_divmod_round_trip() {
2888        let a = [123u128, 7, 0, 0];
2889        let b = [456u128, 0, 0, 0];
2890        let mut prod = [0u128; 8];
2891        limbs_mul(&a, &b, &mut prod);
2892        let mut q = [0u128; 8];
2893        let mut r = [0u128; 8];
2894        limbs_divmod(&prod, &b, &mut q, &mut r);
2895        assert_eq!(&q[..4], &a, "quotient");
2896        assert!(limbs_is_zero(&r), "remainder");
2897    }
2898
2899    #[test]
2900    fn shifts() {
2901        let a = [1u128, 0];
2902        let mut out = [0u128; 2];
2903        limbs_shl(&a, 130, &mut out);
2904        assert_eq!(out, [0, 4]);
2905        let mut back = [0u128; 2];
2906        limbs_shr(&out, 130, &mut back);
2907        assert_eq!(back, [1, 0]);
2908    }
2909
2910    #[test]
2911    fn isqrt_basic() {
2912        let n = [0u128, 0, 1, 0];
2913        let mut out = [0u128; 4];
2914        limbs_isqrt(&n, &mut out);
2915        assert_eq!(out, [0, 1, 0, 0]);
2916        let n = [144u128, 0];
2917        let mut out = [0u128; 2];
2918        limbs_isqrt(&n, &mut out);
2919        assert_eq!(out, [12, 0]);
2920        let n = [2u128, 0];
2921        let mut out = [0u128; 2];
2922        limbs_isqrt(&n, &mut out);
2923        assert_eq!(out, [1, 0]);
2924    }
2925
2926    #[test]
2927    fn add_sub_carry() {
2928        let mut a = [u128::MAX, 0];
2929        let carry = limbs_add_assign(&mut a, &[1, 0]);
2930        assert!(!carry);
2931        assert_eq!(a, [0, 1]);
2932        let borrow = limbs_sub_assign(&mut a, &[1, 0]);
2933        assert!(!borrow);
2934        assert_eq!(a, [u128::MAX, 0]);
2935    }
2936
2937    /// `div_2_by_1` matches the obvious `(high·2^128 + low) / d`
2938    /// formula on representative inputs.
2939    #[test]
2940    fn div_2_by_1_basics() {
2941        // 1 / 1 = 1 r 0.
2942        assert_eq!(div_2_by_1(0, 1, 1), (1, 0));
2943        // 5 / 2 = 2 r 1.
2944        assert_eq!(div_2_by_1(0, 5, 2), (2, 1));
2945        // (3·2^128) / 4 = 3·2^126 r 0.
2946        assert_eq!(div_2_by_1(3, 0, 4), (3 << 126, 0));
2947        // High limb just under divisor — exercises the r_top overflow
2948        // recovery in the inner loop.
2949        let d = u128::MAX - 7;
2950        let (q, r) = div_2_by_1(d - 1, u128::MAX, d);
2951        // q · d + r == (d−1)·2^128 + (2^128 − 1)
2952        let (mul_hi, mul_lo) = mul_128(q, d);
2953        let (sum_lo, c) = mul_lo.overflowing_add(r);
2954        let sum_hi = mul_hi + c as u128;
2955        assert_eq!(sum_hi, d - 1);
2956        assert_eq!(sum_lo, u128::MAX);
2957        assert!(r < d);
2958    }
2959
2960    // ── u64-primitive equivalence: u64 versions vs u128 oracle ──────
2961
2962    /// Pack a `[u128; N]` little-endian limb array into `[u64; 2*N]`.
2963    fn pack(limbs: &[u128]) -> alloc::vec::Vec<u64> {
2964        let mut out = alloc::vec![0u64; 2 * limbs.len()];
2965        for (i, &l) in limbs.iter().enumerate() {
2966            out[2 * i] = l as u64;
2967            out[2 * i + 1] = (l >> 64) as u64;
2968        }
2969        out
2970    }
2971
2972    /// Unpack a `[u64; 2*N]` slice into a `[u128; N]` vec.
2973    fn unpack(words: &[u64]) -> alloc::vec::Vec<u128> {
2974        assert!(words.len() % 2 == 0);
2975        let mut out = alloc::vec![0u128; words.len() / 2];
2976        for i in 0..out.len() {
2977            out[i] = (words[2 * i] as u128) | ((words[2 * i + 1] as u128) << 64);
2978        }
2979        out
2980    }
2981
2982    fn corpus() -> alloc::vec::Vec<alloc::vec::Vec<u128>> {
2983        alloc::vec![
2984            alloc::vec![0u128, 0, 0, 0],
2985            alloc::vec![1u128, 0, 0, 0],
2986            alloc::vec![u128::MAX, 0, 0, 0],
2987            alloc::vec![u128::MAX, u128::MAX, 0, 0],
2988            alloc::vec![u128::MAX, u128::MAX, u128::MAX, u128::MAX],
2989            alloc::vec![123u128, 456, 0, 0],
2990            alloc::vec![
2991                0x1234_5678_9abc_def0_fedc_ba98_7654_3210_u128,
2992                0xa5a5_a5a5_5a5a_5a5a_3c3c_3c3c_c3c3_c3c3,
2993                0,
2994                0,
2995            ],
2996        ]
2997    }
2998
2999    /// `limbs_mul_u64` matches `limbs_mul` after pack/unpack on the
3000    /// shared corpus.
3001    #[test]
3002    fn limbs_mul_u64_matches_u128() {
3003        for a in corpus() {
3004            for b in corpus() {
3005                let mut out128 = alloc::vec![0u128; a.len() + b.len()];
3006                limbs_mul(&a, &b, &mut out128);
3007
3008                let a64 = pack(&a);
3009                let b64 = pack(&b);
3010                let mut out64 = alloc::vec![0u64; a64.len() + b64.len()];
3011                limbs_mul_u64(&a64, &b64, &mut out64);
3012
3013                assert_eq!(unpack(&out64), out128, "limbs_mul mismatch");
3014            }
3015        }
3016    }
3017
3018    /// `limbs_mul_karatsuba_u64` matches `limbs_mul_u64` on equal-length
3019    /// operands across the carry-stressing corpus. Proves the recursive
3020    /// split + recombine algebra holds for the worst-case inputs.
3021    #[test]
3022    fn limbs_mul_karatsuba_u64_matches_schoolbook() {
3023        for a in corpus() {
3024            for b in corpus() {
3025                let a64 = pack(&a);
3026                let b64 = pack(&b);
3027                let n = a64.len().min(b64.len());
3028                if n < super::KARATSUBA_THRESHOLD_U64 {
3029                    continue;
3030                }
3031                let mut a_buf = alloc::vec![0u64; n];
3032                let mut b_buf = alloc::vec![0u64; n];
3033                a_buf.copy_from_slice(&a64[..n]);
3034                b_buf.copy_from_slice(&b64[..n]);
3035                let mut out_school = alloc::vec![0u64; 2 * n];
3036                let mut out_kara = alloc::vec![0u64; 2 * n];
3037                limbs_mul_u64(&a_buf, &b_buf, &mut out_school);
3038                limbs_mul_karatsuba_u64(&a_buf, &b_buf, &mut out_kara);
3039                assert_eq!(out_kara, out_school, "Karatsuba mismatch at n={n}");
3040            }
3041        }
3042    }
3043
3044    /// `limbs_mul_u64_fixed::<L, D>` matches `limbs_mul_u64` at
3045    /// a representative set of compile-time `L` values covering
3046    /// every wide tier (D38..D1232). Each L gets its own
3047    /// monomorphisation; the test confirms the unrolled-by-LLVM
3048    /// fixed-array path produces the same output as the slice
3049    /// path for every shape in the carry-stressing corpus.
3050    #[test]
3051    fn limbs_mul_u64_fixed_matches_slice() {
3052        macro_rules! check {
3053            ($L:expr, $D:expr) => {{
3054                for a in corpus() {
3055                    for b in corpus() {
3056                        let a64 = pack(&a);
3057                        let b64 = pack(&b);
3058                        if a64.len() < $L || b64.len() < $L {
3059                            continue;
3060                        }
3061                        let mut a_arr = [0u64; $L];
3062                        let mut b_arr = [0u64; $L];
3063                        a_arr.copy_from_slice(&a64[..$L]);
3064                        b_arr.copy_from_slice(&b64[..$L]);
3065                        let mut out_slice = alloc::vec![0u64; $D];
3066                        let mut out_fixed = [0u64; $D];
3067                        limbs_mul_u64(&a_arr, &b_arr, &mut out_slice);
3068                        limbs_mul_u64_fixed::<$L, $D>(&a_arr, &b_arr, &mut out_fixed);
3069                        assert_eq!(
3070                            &out_slice[..],
3071                            &out_fixed[..],
3072                            "limbs_mul_u64_fixed::<{}, {}> mismatch",
3073                            $L, $D
3074                        );
3075                    }
3076                }
3077            }};
3078        }
3079        check!(2, 4);
3080        check!(4, 8);
3081        check!(8, 16);
3082        check!(16, 32);
3083        check!(24, 48);
3084        check!(32, 64);
3085        check!(48, 96);
3086        check!(64, 128);
3087    }
3088
3089    /// `limbs_mul_u64_into::<L, L+1>` matches `limbs_mul_u64_fixed::<L, 2·L>`
3090    /// when the wider operand is `[n, 0, ..., 0]`, across L covering every
3091    /// wide tier from D38 (L=2) to D307 (L=16). 1000 random (a, n) pairs
3092    /// per L from a deterministic SplitMix64 stream — no run-to-run drift,
3093    /// regression-friendly. Tail-zero limbs from the wide product are
3094    /// asserted alongside the leading `L + 1` so any spurious write past
3095    /// the truncated output is caught.
3096    #[test]
3097    fn limbs_mul_u64_into_matches_fixed() {
3098        // SplitMix64 — Vigna 2014, public-domain reference algorithm.
3099        let mut state: u64 = 0xDEAD_BEEF_CAFE_F00D;
3100        let mut next = || -> u64 {
3101            state = state.wrapping_add(0x9E37_79B9_7F4A_7C15);
3102            let mut z = state;
3103            z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
3104            z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
3105            z ^ (z >> 31)
3106        };
3107
3108        macro_rules! check_into {
3109            ($L:expr, $LP1:expr, $D:expr) => {{
3110                for _ in 0..1000 {
3111                    let mut a = [0u64; $L];
3112                    for slot in a.iter_mut() {
3113                        *slot = next();
3114                    }
3115                    let n = next();
3116
3117                    let mut out_into = [0u64; $LP1];
3118                    limbs_mul_u64_into::<$L, $LP1>(&a, n, &mut out_into);
3119
3120                    let mut b = [0u64; $L];
3121                    b[0] = n;
3122                    let mut out_fixed = [0u64; $D];
3123                    limbs_mul_u64_fixed::<$L, $D>(&a, &b, &mut out_fixed);
3124
3125                    assert_eq!(
3126                        &out_into[..],
3127                        &out_fixed[..$LP1],
3128                        "limbs_mul_u64_into::<{}, {}> low limbs mismatch \
3129                         (a={:?}, n={:#x})",
3130                        $L, $LP1, a, n
3131                    );
3132                    for (k, &limb) in out_fixed[$LP1..].iter().enumerate() {
3133                        assert_eq!(
3134                            limb, 0,
3135                            "limbs_mul_u64_fixed high limb {} not zero \
3136                             — single-multiplier product must fit L+1 limbs",
3137                            $LP1 + k
3138                        );
3139                    }
3140                }
3141            }};
3142        }
3143        check_into!(2, 3, 4);
3144        check_into!(3, 4, 6);
3145        check_into!(4, 5, 8);
3146        check_into!(6, 7, 12);
3147        check_into!(8, 9, 16);
3148        check_into!(16, 17, 32);
3149    }
3150
3151    /// `limbs_divmod_u64` matches `limbs_divmod`.
3152    #[test]
3153    fn limbs_divmod_u64_matches_u128() {
3154        for num in corpus() {
3155            for den in corpus() {
3156                if den.iter().all(|&x| x == 0) {
3157                    continue;
3158                }
3159                let mut q128 = alloc::vec![0u128; num.len()];
3160                let mut r128 = alloc::vec![0u128; num.len()];
3161                limbs_divmod(&num, &den, &mut q128, &mut r128);
3162
3163                let n64 = pack(&num);
3164                let d64 = pack(&den);
3165                let mut q64 = alloc::vec![0u64; n64.len()];
3166                let mut r64 = alloc::vec![0u64; n64.len()];
3167                limbs_divmod_u64(&n64, &d64, &mut q64, &mut r64);
3168
3169                assert_eq!(unpack(&q64), q128, "divmod q mismatch");
3170                assert_eq!(unpack(&r64), r128, "divmod r mismatch");
3171            }
3172        }
3173    }
3174
3175    /// `limbs_divmod_knuth_u64` matches `limbs_divmod_knuth`.
3176    #[test]
3177    fn limbs_divmod_knuth_u64_matches_u128() {
3178        for num in corpus() {
3179            for den in corpus() {
3180                if den.iter().all(|&x| x == 0) {
3181                    continue;
3182                }
3183                let mut q128 = alloc::vec![0u128; num.len()];
3184                let mut r128 = alloc::vec![0u128; num.len()];
3185                limbs_divmod_knuth(&num, &den, &mut q128, &mut r128);
3186
3187                let n64 = pack(&num);
3188                let d64 = pack(&den);
3189                let mut q64 = alloc::vec![0u64; n64.len()];
3190                let mut r64 = alloc::vec![0u64; n64.len()];
3191                limbs_divmod_knuth_u64(&n64, &d64, &mut q64, &mut r64);
3192
3193                assert_eq!(unpack(&q64), q128, "knuth q mismatch");
3194                assert_eq!(unpack(&r64), r128, "knuth r mismatch");
3195            }
3196        }
3197    }
3198
3199    /// `MG3by2U64` matches the `limbs_divmod_u64` oracle on a
3200    /// representative corpus. Tests the corner cases that historically
3201    /// broke MG 3-by-2 implementations: numerator near divisor (so the
3202    /// initial q estimate may overshoot by 2), minimal normalised d1
3203    /// (= B/2), maximal d1, and the paper's worst-case corner where r1
3204    /// must compare against q_lo without underflow.
3205    #[test]
3206    fn mg3by2_u64_matches_reference() {
3207        let cases: &[(u64, u64, u64, u64, u64)] = &[
3208            // (n2, n1, n0, d1, d0) — d1 normalised, (n2, n1) < (d1, d0).
3209            // Minimal normalised d1 = B/2.
3210            (0, 0, 1, 1u64 << 63, 0),
3211            (0, 1, 0, 1u64 << 63, 0),
3212            ((1u64 << 63) - 1, u64::MAX, u64::MAX, 1u64 << 63, 1),
3213            // Maximal d1 = B-1.
3214            (u64::MAX - 1, u64::MAX, u64::MAX, u64::MAX, u64::MAX),
3215            (0, 0, 1, u64::MAX, 1),
3216            // Mid-range divisor; numerator just under (d1, d0).
3217            (0xc0ffee, 0xdead_beef, 0xface_b00c, (1u64 << 63) | 0xc0ffee_u64, 0xdead_beef_face_b00c),
3218            // Small numerator vs large divisor (quotient = 0).
3219            (0, 1, 2, (1u64 << 63) | 1, 2),
3220            // Numerator = divisor (quotient = 1, remainder = 0). Need to
3221            // express (d1, d0, 0) carefully: n2 = 0, then we'd violate
3222            // the precondition. Skip; this is a degenerate corner the
3223            // Knuth caller never hits.
3224        ];
3225        for &(n2, n1, n0, d1, d0) in cases {
3226            assert!(d1 >> 63 == 1, "d1 not normalised: {d1:#x}");
3227            assert!(
3228                n2 < d1 || (n2 == d1 && n1 < d0),
3229                "test precondition (n2, n1) < (d1, d0) violated"
3230            );
3231            let mg = MG3by2U64::new(d1, d0);
3232            let (q, r1, r0) = mg.div_rem(n2, n1, n0);
3233
3234            // Reference: 3-limb numerator / 2-limb divisor via
3235            // limbs_divmod_u64. The function requires
3236            // `rem.len() >= num.len()` so size both at 3.
3237            let num = alloc::vec![n0, n1, n2];
3238            let den = alloc::vec![d0, d1];
3239            let mut q_ref = alloc::vec![0u64; 3];
3240            let mut r_ref = alloc::vec![0u64; 3];
3241            limbs_divmod_u64(&num, &den, &mut q_ref, &mut r_ref);
3242
3243            assert_eq!(q_ref[0], q, "MG3by2 q mismatch for n=({n2:#x},{n1:#x},{n0:#x}) d=({d1:#x},{d0:#x})");
3244            assert_eq!(q_ref[1], 0, "MG3by2 q higher limb non-zero — precondition violated");
3245            assert_eq!(q_ref[2], 0, "MG3by2 q higher limb non-zero — precondition violated");
3246            assert_eq!(r_ref[0], r0, "MG3by2 r0 mismatch");
3247            assert_eq!(r_ref[1], r1, "MG3by2 r1 mismatch");
3248        }
3249    }
3250
3251    /// `MG2by1U64` matches a reference 2-by-1 divide.
3252    #[test]
3253    fn mg2by1_u64_matches_reference() {
3254        let cases: &[(u64, u64, u64)] = &[
3255            (0, 1, 1u64 << 63),
3256            (0, u64::MAX, 1u64 << 63),
3257            ((1u64 << 63) - 1, u64::MAX, 1u64 << 63),
3258            (0, 1, u64::MAX),
3259            (u64::MAX - 1, u64::MAX, u64::MAX),
3260            (12345, 67890, (1u64 << 63) | 0xdead_beef_u64),
3261            (u64::MAX - 1, 0, u64::MAX),
3262        ];
3263        for &(u1, u0, d) in cases {
3264            assert!(d >> 63 == 1);
3265            assert!(u1 < d);
3266            let mg = MG2by1U64::new(d);
3267            let (q, r) = mg.div_rem(u1, u0);
3268            // Reference: ((u1 as u128) << 64 | u0 as u128) / (d as u128)
3269            let num = ((u1 as u128) << 64) | (u0 as u128);
3270            let exp_q = (num / (d as u128)) as u64;
3271            let exp_r = (num % (d as u128)) as u64;
3272            assert_eq!((q, r), (exp_q, exp_r), "MG u64 mismatch for {u1:#x}, {u0:#x}, d={d:#x}");
3273        }
3274    }
3275
3276    /// `MG2by1::div_rem` bit-exact matches `div_2_by_1` on a
3277    /// battery of corner cases that historically broke 2-by-1 MG
3278    /// implementations: maximal numerator with minimal-normalised
3279    /// divisor (forces the +1 step's u128 overflow), boundary
3280    /// q̂ = u128::MAX, exact divides, and the smallest divisor that
3281    /// satisfies the normalisation requirement (`d` with top bit
3282    /// set and otherwise zero, i.e. `B/2`).
3283    #[test]
3284    fn mg2by1_matches_div_2_by_1() {
3285        // For each (u1, u0, d), the precondition is `u1 < d` and
3286        // `d >> 127 == 1` (normalised).
3287        let cases: &[(u128, u128, u128)] = &[
3288            // Minimal normalised divisor (d = B/2).
3289            (0, 1, 1u128 << 127),
3290            (0, u128::MAX, 1u128 << 127),
3291            ((1u128 << 127) - 1, u128::MAX, 1u128 << 127),
3292            // Maximal divisor (d = u128::MAX = B-1).
3293            (0, 1, u128::MAX),
3294            (u128::MAX - 1, u128::MAX, u128::MAX),
3295            // The exact corner case worked through in the docs:
3296            // u1 = B-2, u0 = B-1, d = B-1 → (q, r) = (B-1, B-2).
3297            (u128::MAX - 1, u128::MAX, u128::MAX),
3298            // Mid-range with mid-range divisor.
3299            (12345, 67890, (1u128 << 127) | 0xdead_beefu128),
3300            // Numerator equals divisor minus one in the high limb —
3301            // exercises the "q̂ = u128::MAX" path.
3302            (u128::MAX - 1, 0, u128::MAX),
3303            // Random spread.
3304            (0x1234_5678_9abc_def0_u128 ^ 0xa5a5, 0xfedc_ba98_7654_3210_u128, (1u128 << 127) | 0xc0ffee_u128),
3305        ];
3306        for &(u1, u0, d) in cases {
3307            assert!(d >> 127 == 1, "test divisor not normalised: {d:#x}");
3308            assert!(u1 < d, "test precondition u1 < d violated: {u1:#x} >= {d:#x}");
3309            let (q_ref, r_ref) = div_2_by_1(u1, u0, d);
3310            let mg = MG2by1::new(d);
3311            let (q_mg, r_mg) = mg.div_rem(u1, u0);
3312            assert_eq!(
3313                (q_mg, r_mg),
3314                (q_ref, r_ref),
3315                "MG2by1 disagrees with div_2_by_1 for (u1={u1:#x}, u0={u0:#x}, d={d:#x})"
3316            );
3317        }
3318    }
3319
3320    /// `limbs_divmod_knuth` agrees with the canonical `limbs_divmod`
3321    /// on a battery of representative shapes — single-limb divisors,
3322    /// multi-limb divisors, zero remainders, partial overflows in the
3323    /// q̂ refinement step.
3324    #[test]
3325    fn knuth_matches_canonical_divmod() {
3326        let cases: &[(&[u128], &[u128])] = &[
3327            // Simple
3328            (&[42], &[7]),
3329            (&[u128::MAX, 0], &[2]),
3330            // Multi-limb numerator, single-limb denominator.
3331            (&[1, 1, 0, 0], &[3]),
3332            // Multi-limb both — three-limb numerator by two-limb den.
3333            (&[u128::MAX, u128::MAX, 1, 0], &[5, 9]),
3334            // Three-limb both.
3335            (&[u128::MAX, u128::MAX, u128::MAX, 0], &[1, 2, 3]),
3336            // Numerator < denominator — quotient zero, remainder = num.
3337            (&[100, 0, 0], &[200, 0, 1]),
3338            // Equal high limbs (forces the u_top ≥ v_top branch).
3339            (
3340                &[0, 0, u128::MAX, u128::MAX],
3341                &[1, 2, u128::MAX],
3342            ),
3343        ];
3344        for (num, den) in cases {
3345            let mut q_canon = [0u128; 8];
3346            let mut r_canon = [0u128; 8];
3347            limbs_divmod(num, den, &mut q_canon, &mut r_canon);
3348            let mut q_knuth = [0u128; 8];
3349            let mut r_knuth = [0u128; 8];
3350            limbs_divmod_knuth(num, den, &mut q_knuth, &mut r_knuth);
3351            assert_eq!(q_canon, q_knuth, "quotient mismatch on {:?} / {:?}", num, den);
3352            assert_eq!(r_canon, r_knuth, "remainder mismatch on {:?} / {:?}", num, den);
3353        }
3354    }
3355
3356    /// `limbs_divmod_bz` agrees with the canonical path on
3357    /// medium-and-large operands. Recursion engages only above the
3358    /// `BZ_THRESHOLD = 8` limb cutoff.
3359    #[test]
3360    fn bz_matches_canonical_divmod() {
3361        // Builds a 16-limb dividend with a 10-limb divisor — well
3362        // above BZ_THRESHOLD so the recursive path is exercised.
3363        let mut num = [0u128; 16];
3364        for (i, slot) in num.iter_mut().enumerate() {
3365            *slot = (i as u128)
3366                .wrapping_mul(0x9E37_79B9_7F4A_7C15)
3367                .wrapping_add(i as u128);
3368        }
3369        let mut den = [0u128; 10];
3370        for (i, slot) in den.iter_mut().enumerate() {
3371            *slot = ((i + 1) as u128).wrapping_mul(0xBF58_476D_1CE4_E5B9);
3372        }
3373        let mut q_canon = [0u128; 16];
3374        let mut r_canon = [0u128; 16];
3375        limbs_divmod(&num, &den, &mut q_canon, &mut r_canon);
3376        let mut q_bz = [0u128; 16];
3377        let mut r_bz = [0u128; 16];
3378        limbs_divmod_bz(&num, &den, &mut q_bz, &mut r_bz);
3379        assert_eq!(q_canon, q_bz, "BZ quotient mismatch");
3380        assert_eq!(r_canon, r_bz, "BZ remainder mismatch");
3381    }
3382
3383    /// `limbs_mul_fast` dispatches to Karatsuba when both operands are
3384    /// equal-length ≥ `KARATSUBA_MIN`. Verify by comparing the result
3385    /// against schoolbook (`limbs_mul`) on a 16-limb pair.
3386    #[cfg(feature = "alloc")]
3387    #[test]
3388    fn fast_mul_dispatches_to_karatsuba_at_threshold() {
3389        let a: [u128; 16] = core::array::from_fn(|i| (i as u128).wrapping_mul(0xABCD) + 1);
3390        let b: [u128; 16] = core::array::from_fn(|i| (i as u128).wrapping_mul(0xBEEF) + 7);
3391        let mut fast = [0u128; 32];
3392        let mut school = [0u128; 32];
3393        limbs_mul_fast(&a, &b, &mut fast);
3394        limbs_mul(&a, &b, &mut school);
3395        assert_eq!(fast, school, "fast (Karatsuba) and schoolbook disagree");
3396    }
3397
3398    /// `limbs_mul_fast` falls through to schoolbook for unequal lengths
3399    /// or below the threshold. The 8-limb pair below skips Karatsuba.
3400    #[cfg(feature = "alloc")]
3401    #[test]
3402    fn fast_mul_falls_through_to_schoolbook_below_threshold() {
3403        let a: [u128; 8] = core::array::from_fn(|i| (i as u128).wrapping_mul(0x1234) + 1);
3404        let b: [u128; 8] = core::array::from_fn(|i| (i as u128).wrapping_mul(0x5678) + 3);
3405        let mut fast = [0u128; 16];
3406        let mut school = [0u128; 16];
3407        limbs_mul_fast(&a, &b, &mut fast);
3408        limbs_mul(&a, &b, &mut school);
3409        assert_eq!(fast, school);
3410    }
3411
3412    /// Karatsuba called directly below the threshold should still
3413    /// produce the correct product via its internal schoolbook
3414    /// fall-through. This exercises the safety branch in
3415    /// `limbs_mul_karatsuba` that zeros out and delegates back to
3416    /// schoolbook when `n < KARATSUBA_MIN`.
3417    #[cfg(feature = "alloc")]
3418    #[test]
3419    fn karatsuba_safety_fallback_below_threshold() {
3420        let a: [u128; 4] = [123, 456, 789, 0];
3421        let b: [u128; 4] = [987, 654, 321, 0];
3422        let mut karatsuba_out = [0u128; 8];
3423        let mut school_out = [0u128; 8];
3424        limbs_mul_karatsuba(&a, &b, &mut karatsuba_out);
3425        limbs_mul(&a, &b, &mut school_out);
3426        assert_eq!(karatsuba_out, school_out);
3427    }
3428
3429    /// `limbs_isqrt` of `1` returns `1` via the `bits <= 1` short-
3430    /// circuit.
3431    #[test]
3432    fn isqrt_one_short_circuit() {
3433        let n = [1u128, 0];
3434        let mut out = [0u128; 2];
3435        limbs_isqrt(&n, &mut out);
3436        assert_eq!(out, [1, 0]);
3437    }
3438
3439    /// `limbs_isqrt` of `0` returns `0` via the `bits == 0` short-
3440    /// circuit.
3441    #[test]
3442    fn isqrt_zero_short_circuit() {
3443        let n = [0u128, 0];
3444        let mut out = [0u128; 2];
3445        limbs_isqrt(&n, &mut out);
3446        assert_eq!(out, [0, 0]);
3447    }
3448
3449    /// `WideInt::from_mag_sign` for `u128` reads the first limb and
3450    /// ignores the sign flag. Exercised through a chained `wide_cast`
3451    /// `Int256 → u128`.
3452    #[test]
3453    fn wide_cast_into_u128_returns_first_limb() {
3454        let src = Int256::from_i128(123_456_789);
3455        let dst: u128 = wide_cast(src);
3456        assert_eq!(dst, 123_456_789);
3457        // Casting ZERO yields 0.
3458        let dst: u128 = wide_cast(Int256::ZERO);
3459        assert_eq!(dst, 0);
3460    }
3461
3462    /// Knuth's q̂-cap path fires when `u_top >= v_top` in the
3463    /// per-quotient-limb loop. We engineer a dividend whose normalised
3464    /// top limb equals the normalised divisor top so the cap (`q̂ =
3465    /// u128::MAX`, plus the subsequent multiply-subtract correction)
3466    /// runs, then verify the resulting quotient matches the canonical
3467    /// `limbs_divmod`.
3468    #[test]
3469    fn knuth_q_hat_cap_branch_matches_canonical() {
3470        // num top limb == den top limb; div quotient's first chunk hits
3471        // the cap. Picking the divisor's top close to u128::MAX
3472        // tightens the normalisation shift.
3473        let num: [u128; 4] = [0, 0, u128::MAX, u128::MAX >> 1];
3474        let den: [u128; 3] = [1, 2, u128::MAX >> 1];
3475        let mut q_canon = [0u128; 4];
3476        let mut r_canon = [0u128; 4];
3477        limbs_divmod(&num, &den, &mut q_canon, &mut r_canon);
3478        let mut q_knuth = [0u128; 4];
3479        let mut r_knuth = [0u128; 4];
3480        limbs_divmod_knuth(&num, &den, &mut q_knuth, &mut r_knuth);
3481        assert_eq!(q_canon, q_knuth);
3482        assert_eq!(r_canon, r_knuth);
3483    }
3484
3485    /// `limbs_divmod_bz` with a numerator that has trailing zero limbs
3486    /// strips them off in its top-non-zero scan before deciding whether
3487    /// to recurse.
3488    #[test]
3489    fn bz_strips_numerator_trailing_zeros() {
3490        // 16-limb buffer but only the low half is non-zero; den is 10 limbs.
3491        // BZ should recognise top < 2*n and fall back to Knuth.
3492        let mut num = [0u128; 16];
3493        for slot in &mut num[..8] {
3494            *slot = 0xCAFE_F00D;
3495        }
3496        let mut den = [0u128; 10];
3497        den[0] = 7;
3498        let mut q_canon = [0u128; 16];
3499        let mut r_canon = [0u128; 16];
3500        limbs_divmod(&num, &den, &mut q_canon, &mut r_canon);
3501        let mut q_bz = [0u128; 16];
3502        let mut r_bz = [0u128; 16];
3503        limbs_divmod_bz(&num, &den, &mut q_bz, &mut r_bz);
3504        assert_eq!(q_canon, q_bz);
3505        assert_eq!(r_canon, r_bz);
3506    }
3507}