Skip to main content

decimal_scaled/wide_int/
mod.rs

//! Generic wide-integer arithmetic.
//!
//! A self-contained fixed-width big-integer layer: signed `Int*` and
//! unsigned `Uint*` two's-complement integers from 256 to 4096 bits,
//! plus the `WideInt` trait that casts losslessly between any two
//! widths (or a primitive `i128` / `i64` / `u128`). The module depends
//! on nothing else in the crate and is structured so it can later be
//! lifted into a standalone crate.
//!
//! # Structure
//!
//! - **Slice primitives** — the actual arithmetic, written once over
//!   `&[u128]` limb slices (little-endian, `limbs[0]` least
//!   significant). Operating on slices sidesteps the const-generic
//!   return-type problem a widening multiply would otherwise hit. The
//!   core routines are `const fn` so the integer types built on them
//!   can expose `const` constructors and constants.
//! - **`macros`** — the `decl_wide_int!` macro: emits a concrete
//!   two's-complement signed integer type and its unsigned sibling for
//!   a fixed limb count, delegating arithmetic to the slice primitives.
//! - The concrete `Uint* / Int*` type pairs generated by that macro.

// ─────────────────────────────────────────────────────────────────────
// Slice primitives — unsigned limb-array arithmetic.
//
// Every routine treats its slices as little-endian unsigned integers.
// Lengths are taken from the slices; callers size output buffers.
// ─────────────────────────────────────────────────────────────────────

/// Full 128×128 → 256 unsigned product, `(high, low)`.
#[inline]
const fn mul_128(a: u128, b: u128) -> (u128, u128) {
    // Split each operand into 64-bit halves; every half-product then
    // fits a u128 exactly, so the four partials combine without loss.
    let mask = u64::MAX as u128;
    let (x1, x0) = (a >> 64, a & mask);
    let (y1, y0) = (b >> 64, b & mask);
    // Cross terms share the 2^64 position; track their overflow bit.
    let (cross, cross_of) = (x0 * y1).overflowing_add(x1 * y0);
    let (low, low_of) = (x0 * y0).overflowing_add(cross << 64);
    let high = x1 * y1 + (cross >> 64) + ((cross_of as u128) << 64) + low_of as u128;
    (high, low)
}

/// `a == 0`.
#[inline]
pub(crate) const fn limbs_is_zero(a: &[u128]) -> bool {
    // Scan from the most significant limb down; any nonzero limb
    // disproves zero-ness. Empty slices are zero by convention.
    let mut i = a.len();
    while i > 0 {
        i -= 1;
        if a[i] != 0 {
            return false;
        }
    }
    true
}

/// `a == b` for two limb slices of possibly different lengths.
#[inline]
pub(crate) const fn limbs_eq(a: &[u128], b: &[u128]) -> bool {
    // Walk the longer length, reading missing limbs as zero so the
    // shorter slice is compared as if zero-extended.
    let longest = if a.len() < b.len() { b.len() } else { a.len() };
    let mut idx = 0;
    while idx < longest {
        let lhs = if idx < a.len() { a[idx] } else { 0 };
        let rhs = if idx < b.len() { b[idx] } else { 0 };
        if lhs != rhs {
            return false;
        }
        idx += 1;
    }
    true
}

/// Three-way comparison of two limb slices of possibly different
/// lengths. Returns `-1`, `0`, or `1` for less / equal / greater.
#[inline]
pub(crate) const fn limbs_cmp(a: &[u128], b: &[u128]) -> i32 {
    // Compare most-significant-first over the longer length; missing
    // limbs of the shorter slice read as zero. The first unequal limb
    // decides the ordering.
    let mut idx = if a.len() < b.len() { b.len() } else { a.len() };
    while idx > 0 {
        idx -= 1;
        let lhs = if idx < a.len() { a[idx] } else { 0 };
        let rhs = if idx < b.len() { b[idx] } else { 0 };
        if lhs != rhs {
            return if lhs < rhs { -1 } else { 1 };
        }
    }
    0
}

/// Bit length (`0` for zero, else `floor(log2)+1`).
#[inline]
pub(crate) const fn limbs_bit_len(a: &[u128]) -> u32 {
    // Locate the most-significant nonzero limb; its index fixes the
    // whole-limb contribution and its leading zeros fix the rest.
    let mut idx = a.len();
    loop {
        if idx == 0 {
            return 0;
        }
        idx -= 1;
        if a[idx] != 0 {
            break;
        }
    }
    (idx as u32) * 128 + 128 - a[idx].leading_zeros()
}

/// `a += b`, returning the carry out. `a.len() >= b.len()`.
#[inline]
pub(crate) const fn limbs_add_assign(a: &mut [u128], b: &[u128]) -> bool {
    // Ripple-carry addition. The two partial adds can never both
    // overflow in one step (the first leaves at most MAX-1 when it
    // wraps), so a bool carry suffices.
    let mut carry = false;
    let mut idx = 0;
    while idx < a.len() {
        let addend = if idx < b.len() { b[idx] } else { 0 };
        let (sum, c1) = a[idx].overflowing_add(addend);
        let (sum, c2) = sum.overflowing_add(carry as u128);
        a[idx] = sum;
        carry = c1 | c2;
        idx += 1;
    }
    carry
}

/// `a -= b`, returning the borrow out. `a.len() >= b.len()`.
#[inline]
pub(crate) const fn limbs_sub_assign(a: &mut [u128], b: &[u128]) -> bool {
    // Ripple-borrow subtraction, mirroring `limbs_add_assign`: the two
    // partial subtractions can never both borrow in one step, so a
    // bool borrow suffices.
    let mut borrow = false;
    let mut idx = 0;
    while idx < a.len() {
        let subtrahend = if idx < b.len() { b[idx] } else { 0 };
        let (diff, b1) = a[idx].overflowing_sub(subtrahend);
        let (diff, b2) = diff.overflowing_sub(borrow as u128);
        a[idx] = diff;
        borrow = b1 | b2;
        idx += 1;
    }
    borrow
}

/// `out = a << shift`. `out` is zeroed then filled; bits shifted past
/// `out`'s width are dropped.
pub(crate) const fn limbs_shl(a: &[u128], shift: u32, out: &mut [u128]) {
    // Clear the destination so the OR-merge below starts clean.
    let mut i = 0;
    while i < out.len() {
        out[i] = 0;
        i += 1;
    }
    // Decompose the shift into whole limbs plus a sub-limb bit count.
    let limb_off = (shift as usize) / 128;
    let bit_off = shift % 128;
    let mut src = 0;
    while src < a.len() {
        let dst = src + limb_off;
        if dst < out.len() {
            if bit_off == 0 {
                out[dst] |= a[src];
            } else {
                // Each source limb splits across two destination limbs.
                out[dst] |= a[src] << bit_off;
                if dst + 1 < out.len() {
                    out[dst + 1] |= a[src] >> (128 - bit_off);
                }
            }
        }
        src += 1;
    }
}

/// `out = a >> shift`. `out` is zeroed then filled.
pub(crate) const fn limbs_shr(a: &[u128], shift: u32, out: &mut [u128]) {
    // Clear the destination so the OR-merge below starts clean.
    let mut i = 0;
    while i < out.len() {
        out[i] = 0;
        i += 1;
    }
    // Decompose the shift into whole limbs plus a sub-limb bit count.
    let limb_off = (shift as usize) / 128;
    let bit_off = shift % 128;
    // Limbs below `limb_off` shift out entirely; start above them.
    let mut src = limb_off;
    while src < a.len() {
        let dst = src - limb_off;
        if dst < out.len() {
            if bit_off == 0 {
                out[dst] |= a[src];
            } else {
                // Each source limb splits across two destination limbs.
                out[dst] |= a[src] >> bit_off;
                if dst > 0 {
                    out[dst - 1] |= a[src] << (128 - bit_off);
                }
            }
        }
        src += 1;
    }
}

/// `out = a · b` (schoolbook). `out.len() >= a.len() + b.len()` and
/// `out` must be zeroed by the caller.
pub(crate) const fn limbs_mul(a: &[u128], b: &[u128], out: &mut [u128]) {
    // Fast path for the 2-limb × 2-limb → 4-limb shape used by
    // `Int256::wrapping_mul` (the densest wide-int call site). Hand
    // unrolled so the compiler sees four independent `mul_128`
    // sub-products that can issue in parallel; the inner loop variant
    // can't express that.
    if a.len() == 2 && b.len() == 2 && out.len() >= 4 {
        let (a0, a1) = (a[0], a[1]);
        let (b0, b1) = (b[0], b[1]);
        // (a1·2^128 + a0)·(b1·2^128 + b0) = h3·2^384 + (h1+h2+l3)·2^256
        // + (h0+l1+l2)·2^128 + l0
        let (h0, l0) = mul_128(a0, b0);
        let (h1, l1) = mul_128(a0, b1);
        let (h2, l2) = mul_128(a1, b0);
        let (h3, l3) = mul_128(a1, b1);

        out[0] = l0;

        // 2^128 column: h0 + l1 + l2, carries collected for the next
        // column.
        let (s1, c1a) = h0.overflowing_add(l1);
        let (s1, c1b) = s1.overflowing_add(l2);
        out[1] = s1;
        let mid_carry = (c1a as u128) + (c1b as u128);

        // 2^256 column: h1 + h2 + l3 plus the incoming carry.
        let (s2, c2a) = h1.overflowing_add(h2);
        let (s2, c2b) = s2.overflowing_add(l3);
        let (s2, c2c) = s2.overflowing_add(mid_carry);
        out[2] = s2;
        let top_carry = (c2a as u128) + (c2b as u128) + (c2c as u128);

        // Top column: a 256-bit × 256-bit product fits 512 bits
        // exactly, so this add cannot actually wrap.
        out[3] = h3.wrapping_add(top_carry);
        return;
    }

    // General schoolbook: for each nonzero limb of `a`, accumulate its
    // row of partial products into `out` with a rippling carry.
    let mut i = 0;
    while i < a.len() {
        if a[i] != 0 {
            let mut carry = 0u128;
            let mut j = 0;
            while j < b.len() {
                // Skip zero-limb contributions: every `*` for `b[j] = 0`
                // produces (0, 0) and the only effect is propagating the
                // carry. Adds a hot fast path for widened-from-narrower
                // operands (their upper limbs are zero) and for
                // small-magnitude multipliers like `10^SCALE` at modest
                // scales (one nonzero limb in `b`).
                if b[j] != 0 || carry != 0 {
                    let (hi, lo) = mul_128(a[i], b[j]);
                    let idx = i + j;
                    let (s1, c1) = out[idx].overflowing_add(lo);
                    let (s2, c2) = s1.overflowing_add(carry);
                    out[idx] = s2;
                    carry = hi + (c1 as u128) + (c2 as u128);
                }
                j += 1;
            }
            // Spill any leftover carry into the limbs above the row,
            // stopping at out's width (caller sized it to hold the
            // full product).
            let mut idx = i + b.len();
            while carry != 0 && idx < out.len() {
                let (s, c) = out[idx].overflowing_add(carry);
                out[idx] = s;
                carry = c as u128;
                idx += 1;
            }
        }
        i += 1;
    }
}

/// Karatsuba multiplication threshold. For `a.len() <= KARATSUBA_MIN`
/// schoolbook wins on constant factors; above that recursive splitting
/// pays off.
///
/// NOTE(review): the tuning narrative below argues the cross-over at
/// 8 limbs, yet the constant is 16 — presumably re-tuned after the
/// comment was written; confirm against the benchmarks.
///
/// Tuned empirically for the `[u128]` limb layout — at 8
/// limbs (Int1024) schoolbook is 64 limb-mul + carry chains vs
/// Karatsuba's 3 sub-products of 4 limbs each (48 limb-mul) plus three
/// shift-and-add merges; the cross-over is here.
const KARATSUBA_MIN: usize = 16;

/// `out = a · b` for equal-length inputs, dispatching to Karatsuba
/// when the operand size warrants it.
///
/// Not `const fn` — Karatsuba's half-sum scratch needs heap allocation.
/// Callers in `const` context (parsing string-literal constants, etc.)
/// keep using [`limbs_mul`].
///
/// `out.len() >= 2 · a.len()`. `a.len() == b.len()` required for the
/// Karatsuba path; mismatched lengths fall through to schoolbook
/// (which expects `out` pre-zeroed, per its contract).
#[cfg(feature = "alloc")]
pub(crate) fn limbs_mul_fast(a: &[u128], b: &[u128], out: &mut [u128]) {
    let karatsuba_applies = a.len() == b.len() && a.len() >= KARATSUBA_MIN;
    if karatsuba_applies {
        limbs_mul_karatsuba(a, b, out)
    } else {
        limbs_mul(a, b, out)
    }
}

/// `out = a · b` when the `alloc` feature is off: Karatsuba needs heap
/// scratch, so this variant always delegates to the schoolbook
/// routine. Same contract as [`limbs_mul`] (caller zeroes `out`).
#[cfg(not(feature = "alloc"))]
pub(crate) fn limbs_mul_fast(a: &[u128], b: &[u128], out: &mut [u128]) {
    limbs_mul(a, b, out);
}

/// Karatsuba multiplication, equal-length inputs.
///
/// Split `a = a_hi·B^h + a_lo`, `b = b_hi·B^h + b_lo` with
/// `B = 2^128` and `h = a.len() / 2`. Compute three sub-products:
///
/// - `z0 = a_lo · b_lo`
/// - `z2 = a_hi · b_hi`
/// - `z1 = (a_lo + a_hi)·(b_lo + b_hi) − z0 − z2`
///
/// Then `a·b = z2·B^(2h) + z1·B^h + z0`.
///
/// Reference: Karatsuba, A. and Ofman, Yu. (1962). "Multiplication of
/// Multidigit Numbers on Automata." *Doklady Akad. Nauk SSSR* 145,
/// 293–294.
#[cfg(feature = "alloc")]
fn limbs_mul_karatsuba(a: &[u128], b: &[u128], out: &mut [u128]) {
    debug_assert_eq!(a.len(), b.len());
    debug_assert!(out.len() >= 2 * a.len());
    let n = a.len();
    if n < KARATSUBA_MIN {
        // Zero out and run schoolbook.
        for o in out.iter_mut().take(2 * n) {
            *o = 0;
        }
        limbs_mul(a, b, out);
        return;
    }
    let h = n / 2;
    let (a_lo, a_hi) = a.split_at(h);
    let (b_lo, b_hi) = b.split_at(h);

    // z0 = a_lo · b_lo (length 2h)
    let mut z0 = alloc::vec![0u128; 2 * h];
    limbs_mul_karatsuba_padded(a_lo, b_lo, &mut z0);

    // z2 = a_hi · b_hi (length 2*(n-h))
    let hi_len = n - h;
    let mut z2 = alloc::vec![0u128; 2 * hi_len];
    limbs_mul_karatsuba_padded(a_hi, b_hi, &mut z2);

    // sum_a = a_lo + a_hi, sum_b = b_lo + b_hi; each half-sum may
    // carry one limb past max(h, hi_len), hence the +1.
    let sum_len = core::cmp::max(h, hi_len) + 1;
    let mut sum_a = alloc::vec![0u128; sum_len];
    let mut sum_b = alloc::vec![0u128; sum_len];
    sum_a[..h].copy_from_slice(a_lo);
    sum_b[..h].copy_from_slice(b_lo);
    limbs_add_assign(&mut sum_a[..], a_hi);
    limbs_add_assign(&mut sum_b[..], b_hi);

    // z1 = sum_a · sum_b − z0 − z2 = a_lo·b_hi + a_hi·b_lo (the cross
    // term). The subtractions cannot underflow: sum_a·sum_b ≥ z0 + z2.
    let mut z1 = alloc::vec![0u128; 2 * sum_len];
    limbs_mul_karatsuba_padded(&sum_a, &sum_b, &mut z1);
    limbs_sub_assign(&mut z1[..], &z0);
    limbs_sub_assign(&mut z1[..], &z2);

    // Combine: out[..2h] = z0; out[2h..] = z2 shifted up by 2h;
    // then add z1 shifted up by h.
    for o in out.iter_mut().take(2 * n) {
        *o = 0;
    }
    let z0_take = core::cmp::min(z0.len(), out.len());
    out[..z0_take].copy_from_slice(&z0[..z0_take]);
    let z2_take = core::cmp::min(z2.len(), out.len().saturating_sub(2 * h));
    if z2_take > 0 {
        out[2 * h..2 * h + z2_take].copy_from_slice(&z2[..z2_take]);
    }
    // Add z1 << h. The z1 window `out[h..h + z1_take]` ends below
    // out's top limb (z1_take = 2·sum_len < 2n − h once n ≥ 16), so
    // the addition's carry-out is a real carry into the upper z2
    // limbs and must be propagated — the previous version discarded
    // `limbs_add_assign`'s return value here, silently corrupting
    // products whose cross-term sum ripples past the window.
    let z1_take = core::cmp::min(z1.len(), out.len().saturating_sub(h));
    if z1_take > 0 {
        let mut carry = limbs_add_assign(&mut out[h..h + z1_take], &z1[..z1_take]);
        let mut idx = h + z1_take;
        while carry && idx < out.len() {
            let (s, c) = out[idx].overflowing_add(1);
            out[idx] = s;
            carry = c;
            idx += 1;
        }
    }
}

/// Karatsuba dispatch helper for sub-products whose slices may be
/// uneven (happens at the recursion boundary when `n` is odd and
/// `n_hi = n - h > h`): recurses when the operands are balanced and
/// large enough, otherwise zeroes `out` and falls back to schoolbook,
/// which accepts mismatched lengths directly.
#[cfg(feature = "alloc")]
fn limbs_mul_karatsuba_padded(a: &[u128], b: &[u128], out: &mut [u128]) {
    let balanced = a.len() == b.len() && a.len() >= KARATSUBA_MIN;
    if balanced {
        limbs_mul_karatsuba(a, b, out);
        return;
    }
    // Schoolbook requires a pre-zeroed destination.
    for limb in out.iter_mut() {
        *limb = 0;
    }
    limbs_mul(a, b, out);
}

/// Single-bit left shift in place; returns the bit shifted out of the
/// top.
#[inline]
const fn limbs_shl1(a: &mut [u128]) -> u128 {
    // Thread each limb's top bit into the next limb's bottom bit.
    let mut incoming = 0u128;
    let mut idx = 0;
    while idx < a.len() {
        let outgoing = a[idx] >> 127;
        a[idx] = (a[idx] << 1) | incoming;
        incoming = outgoing;
        idx += 1;
    }
    incoming
}

/// `true` if every limb above index 0 is zero — the value fits a
/// single 128-bit word.
#[inline]
const fn limbs_fit_one(a: &[u128]) -> bool {
    // Check the upper limbs top-down; index 0 is never examined.
    let mut idx = a.len();
    while idx > 1 {
        idx -= 1;
        if a[idx] != 0 {
            return false;
        }
    }
    true
}

/// `quot = num / den`, `rem = num % den`. `quot.len() >= num.len()`,
/// `rem.len() >= num.len()`; both are zeroed by this routine. `den`
/// must be non-zero.
///
/// Two hardware fast paths short-circuit the binary long-division
/// loop — they cover the dominant decimal cases (moderate magnitudes,
/// divisor `10^scale` for `scale <= 19`):
///
/// - both operands fit a single 128-bit word → one hardware divide;
/// - the divisor fits a 64-bit word → schoolbook base-2^64 division,
///   one hardware divide per limb-half.
///
/// Otherwise it falls back to a binary shift-subtract loop bounded by
/// the dividend's actual bit length.
pub(crate) const fn limbs_divmod(
    num: &[u128],
    den: &[u128],
    quot: &mut [u128],
    rem: &mut [u128],
) {
    // Zero both outputs up front; all paths below only OR/assign into
    // them.
    let mut z = 0;
    while z < quot.len() {
        quot[z] = 0;
        z += 1;
    }
    z = 0;
    while z < rem.len() {
        rem[z] = 0;
        z += 1;
    }

    let den_one_limb = limbs_fit_one(den);

    // Fast path A: both dividend and divisor fit one 128-bit word.
    if den_one_limb && limbs_fit_one(num) {
        if !quot.is_empty() {
            quot[0] = num[0] / den[0];
        }
        if !rem.is_empty() {
            rem[0] = num[0] % den[0];
        }
        return;
    }

    // Fast path B: divisor fits a 64-bit word — schoolbook base-2^64
    // long division, one hardware divide per 64-bit half of the
    // dividend. Every `10^scale` for `scale <= 19` lands here.
    if den_one_limb && den[0] <= u64::MAX as u128 {
        let d = den[0];
        let mut r: u128 = 0;
        // Skip leading zero limbs of the numerator — every wide-tier
        // call widens narrower operands into a `2 × $L`-limb buffer,
        // so the top limbs are zero by construction. Each skipped
        // limb saves two hardware divides.
        let mut top = num.len();
        while top > 0 && num[top - 1] == 0 {
            top -= 1;
        }
        // Process each limb as two 64-bit digits, most significant
        // first, threading the running remainder `r` (< d ≤ 2^64 − 1)
        // through as the high half of each 128-bit dividend.
        let mut i = top;
        while i > 0 {
            i -= 1;
            let hi = num[i] >> 64;
            let acc_hi = (r << 64) | hi;
            let q_hi = acc_hi / d;
            r = acc_hi % d;
            let lo = num[i] & u64::MAX as u128;
            let acc_lo = (r << 64) | lo;
            let q_lo = acc_lo / d;
            r = acc_lo % d;
            if i < quot.len() {
                quot[i] = (q_hi << 64) | q_lo;
            }
        }
        if !rem.is_empty() {
            rem[0] = r;
        }
        return;
    }

    // General path: binary shift-subtract, bounded by the dividend's
    // actual bit length. Per iteration: shift the remainder left one
    // bit, pull in the next dividend bit, and subtract the divisor
    // when it fits, setting the matching quotient bit.
    let bits = limbs_bit_len(num);
    let mut i = bits;
    while i > 0 {
        i -= 1;
        limbs_shl1(rem);
        let bit = (num[(i / 128) as usize] >> (i % 128)) & 1;
        rem[0] |= bit;
        limbs_shl1(quot);
        if limbs_cmp(rem, den) >= 0 {
            limbs_sub_assign(rem, den);
            quot[0] |= 1;
        }
    }
}

/// Capacity of the internal scratch buffers — 72 limbs (9216 bits),
/// comfortably above the widest work integer in the crate (4096-bit →
/// 32 limbs, with isqrt scratch ≤ 33). Shared by the Knuth / BZ
/// division kernels, `limbs_isqrt`, and `limbs_fmt_into`.
const SCRATCH_LIMBS: usize = 72;

/// 2-by-1 unsigned divide: `(high · 2^128 + low) / d` and the matching
/// remainder. Requires `high < d` so the quotient fits a single
/// `u128`.
///
/// Implementation: bit-by-bit recovery (128 iterations, constant work
/// per iter). Slower than a hardware 256-by-128 instruction but Rust's
/// stable surface doesn't expose one, and the tighter shift-multiply
/// estimators (Möller–Granlund) require precomputed reciprocals that
/// only pay back across many divides with the same divisor — Knuth's
/// inner loop sees each divisor at most once per call.
#[inline]
const fn div_2_by_1(high: u128, low: u128, d: u128) -> (u128, u128) {
    // Classical restoring division: each step shifts the partial
    // remainder left by one bit, pulls in the next bit of `low`, then
    // subtracts `d` and sets the quotient bit when it fits. The shift
    // can push the true remainder past 2^128 − 1; the bit it evicts
    // (`overflowed`) keeps the ≥ d comparison honest.
    let mut quotient: u128 = 0;
    let mut rem = high;
    let mut bit = 128u32;
    while bit > 0 {
        bit -= 1;
        let overflowed = rem >> 127 != 0;
        rem = (rem << 1) | ((low >> bit) & 1);
        quotient <<= 1;
        // True remainder = overflowed·2^128 + rem; subtract whenever
        // that value is ≥ d.
        if overflowed || rem >= d {
            rem = rem.wrapping_sub(d);
            quotient |= 1;
        }
    }
    (quotient, rem)
}

/// Knuth Algorithm D — base-2^128 multi-limb long division. The
/// algorithm-of-record base case for `limbs_divmod_bz` below.
///
/// Computes `quot = num / den`, `rem = num % den`. Requires
/// `den` non-zero. `quot` and `rem` are zeroed by this routine.
///
/// This is the textbook Knuth Algorithm D (TAOCP Vol. 2, §4.3.1)
/// adapted to base `2^128`: normalise the divisor so its top bit
/// is set, then for each quotient limb estimate `q̂` from the top
/// two limbs of the running dividend divided by the top limb of
/// the divisor, refine `q̂` once if necessary against the second-
/// from-top divisor limb, multiply-and-subtract, and add-back-and-
/// decrement on the rare miss.
///
/// Complexity is `O(m·n)` multi-limb ops on `m+n / n`-limb inputs,
/// versus the binary shift-subtract path's `O((m+n)·n·128)`. For
/// `n = 32` limbs (Int4096) the difference is ~14×. For `n ≤ 2`
/// limbs there's no win and the caller should keep using the
/// existing single-limb fast paths in `limbs_divmod`.
///
/// Not `const fn`: the inner loops use `[u128; SCRATCH_LIMBS]`
/// scratch buffers and mutate them via overflowing arithmetic
/// that the const evaluator doesn't yet permit. None of the
/// crate's const-contexts depend on this routine.
pub(crate) fn limbs_divmod_knuth(
    num: &[u128],
    den: &[u128],
    quot: &mut [u128],
    rem: &mut [u128],
) {
    // Zero both outputs; every path below only assigns into them.
    for q in quot.iter_mut() {
        *q = 0;
    }
    for r in rem.iter_mut() {
        *r = 0;
    }

    // Effective lengths after stripping leading zeros.
    let mut n = den.len();
    while n > 0 && den[n - 1] == 0 {
        n -= 1;
    }
    assert!(n > 0, "limbs_divmod_knuth: divide by zero");

    let mut top = num.len();
    while top > 0 && num[top - 1] == 0 {
        top -= 1;
    }
    if top < n {
        // quotient is zero, remainder is num.
        let copy_n = num.len().min(rem.len());
        let mut i = 0;
        while i < copy_n {
            rem[i] = num[i];
            i += 1;
        }
        return;
    }

    // D1. Normalise: shift divisor (and dividend) left by `shift` bits
    // so the divisor's top limb has its high bit set. Knuth's q̂
    // refinement guarantee only holds in that regime.
    let shift = den[n - 1].leading_zeros();

    // `u` holds the (normalised) running dividend plus one overflow
    // limb; `v` holds the normalised divisor.
    let mut u = [0u128; SCRATCH_LIMBS];
    let mut v = [0u128; SCRATCH_LIMBS];
    debug_assert!(top < SCRATCH_LIMBS && n <= SCRATCH_LIMBS);

    if shift == 0 {
        u[..top].copy_from_slice(&num[..top]);
        u[top] = 0;
        v[..n].copy_from_slice(&den[..n]);
    } else {
        // Multi-limb left shift by `shift` (< 128) into u and v; the
        // dividend's carry-out becomes its extra top limb u[top].
        let mut carry: u128 = 0;
        for i in 0..top {
            let val = num[i];
            u[i] = (val << shift) | carry;
            carry = val >> (128 - shift);
        }
        u[top] = carry;
        carry = 0;
        for i in 0..n {
            let val = den[i];
            v[i] = (val << shift) | carry;
            carry = val >> (128 - shift);
        }
    }

    let m_plus_n = if u[top] != 0 { top + 1 } else { top };
    debug_assert!(m_plus_n >= n);
    let m = m_plus_n - n;

    // D2. For j from m down to 0.
    let mut j_plus_one = m + 1;
    while j_plus_one > 0 {
        j_plus_one -= 1;
        let j = j_plus_one;

        // D3. Estimate q̂.
        let u_top = u[j + n];
        let u_next = u[j + n - 1];
        let v_top = v[n - 1];

        let (mut q_hat, mut r_hat) = if u_top >= v_top {
            // q̂ would exceed 2^128 − 1. Cap at the max and let the
            // refinement / add-back step correct any over-estimate.
            // r̂ = u_top·2^128 + u_next − q̂·v_top, computed mod 2^128
            // with q̂ = 2^128 − 1: r̂ = u_top·2^128 + u_next − (2^128
            // − 1)·v_top = (u_top − v_top)·2^128 + u_next + v_top.
            // We only need r̂ ≤ 2^128 − 1 for the refinement step; if
            // (u_top − v_top) ≥ 1, r̂ overflows and we skip the
            // refinement (the multiply-subtract handles it).
            let q = u128::MAX;
            let (r, of) = u_next.overflowing_add(v_top);
            // If overflow OR (u_top − v_top) ≥ 1 we treat r̂ as
            // "above 2^128"; signal by returning r_overflow == true.
            if of || u_top > v_top {
                (q, u128::MAX) // sentinel; refinement loop will see r̂ "large" and not subtract.
            } else {
                (q, r)
            }
        } else {
            div_2_by_1(u_top, u_next, v_top)
        };

        // Refinement: while q̂·v[n−2] > r̂·2^128 + u[j+n−2], decrement.
        if n >= 2 {
            let v_below = v[n - 2];
            loop {
                let (hi, lo) = mul_128(q_hat, v_below);
                let rhs_lo = u[j + n - 2];
                let rhs_hi = r_hat;
                // Compare (hi, lo) vs (rhs_hi, rhs_lo).
                if hi < rhs_hi || (hi == rhs_hi && lo <= rhs_lo) {
                    break;
                }
                q_hat = q_hat.wrapping_sub(1);
                let (new_r, of) = r_hat.overflowing_add(v_top);
                if of {
                    // r̂ grew past 2^128 — condition can no longer hold.
                    break;
                }
                r_hat = new_r;
            }
        }

        // D4. Multiply-and-subtract: u[j..=j+n] -= q̂ · v[0..n].
        let mut mul_carry: u128 = 0;
        let mut borrow: u128 = 0;
        for i in 0..n {
            let (hi, lo) = mul_128(q_hat, v[i]);
            let (prod_lo, c1) = lo.overflowing_add(mul_carry);
            let new_mul_carry = hi + u128::from(c1);
            let (s1, b1) = u[j + i].overflowing_sub(prod_lo);
            let (s2, b2) = s1.overflowing_sub(borrow);
            u[j + i] = s2;
            borrow = u128::from(b1) + u128::from(b2);
            mul_carry = new_mul_carry;
        }
        let (s1, b1) = u[j + n].overflowing_sub(mul_carry);
        let (s2, b2) = s1.overflowing_sub(borrow);
        u[j + n] = s2;
        let final_borrow = u128::from(b1) + u128::from(b2);

        // D5/D6. If multiply-subtract went negative, decrement q̂ and
        // add v back.
        if final_borrow != 0 {
            q_hat = q_hat.wrapping_sub(1);
            let mut carry: u128 = 0;
            for i in 0..n {
                let (s1, c1) = u[j + i].overflowing_add(v[i]);
                let (s2, c2) = s1.overflowing_add(carry);
                u[j + i] = s2;
                carry = u128::from(c1) + u128::from(c2);
            }
            // Final carry cancels with the earlier borrow.
            u[j + n] = u[j + n].wrapping_add(carry);
        }

        if j < quot.len() {
            quot[j] = q_hat;
        }
    }

    // D8. Denormalise the remainder: u[0..n] >> shift → rem.
    if shift == 0 {
        let copy_n = n.min(rem.len());
        rem[..copy_n].copy_from_slice(&u[..copy_n]);
    } else {
        for i in 0..n {
            if i < rem.len() {
                let lo = u[i] >> shift;
                let hi_into_lo = if i + 1 < n {
                    u[i + 1] << (128 - shift)
                } else {
                    0
                };
                rem[i] = lo | hi_into_lo;
            }
        }
    }
}

/// Burnikel–Ziegler recursive divide (MPI-I-98-1-022, 1998).
///
/// Splits an unbalanced `m+n / n` divide into a chain of balanced
/// `2n / n` sub-divides, each of which recursively halves. The base
/// case is [`limbs_divmod_knuth`] for divisors below `BZ_THRESHOLD`.
/// On Karatsuba-multiplied operands BZ runs in `O(n^{1.58} · log n)`
/// time vs Knuth's `O(n²)`.
///
/// For the widths this crate actually uses (Int256 … Int4096, ≤ 32
/// limbs) the recursion only saves a constant factor over Knuth and
/// the canonical `limbs_divmod` path stays untouched. BZ is exposed
/// here so a bench-driven follow-up can promote it once a clear win
/// shows up on the wide-tier divides.
///
/// Threshold: recurses only when both `num.len() ≥ 2·BZ_THRESHOLD`
/// and `den.len() ≥ BZ_THRESHOLD`. Below that the cost of splitting
/// dominates and Knuth wins outright.
pub(crate) fn limbs_divmod_bz(
    num: &[u128],
    den: &[u128],
    quot: &mut [u128],
    rem: &mut [u128],
) {
    const BZ_THRESHOLD: usize = 8;

    // Effective lengths after stripping leading zero limbs.
    let mut n = den.len();
    while n > 0 && den[n - 1] == 0 {
        n -= 1;
    }
    assert!(n > 0, "limbs_divmod_bz: divide by zero");

    let mut top = num.len();
    while top > 0 && num[top - 1] == 0 {
        top -= 1;
    }

    if n < BZ_THRESHOLD || top < 2 * n {
        // Base case — Knuth handles every shape efficiently.
        limbs_divmod_knuth(num, den, quot, rem);
        return;
    }

    // BZ recursion: split the dividend into chunks of size `n` from
    // the top, process each chunk with a `2n / n` sub-divide, carry
    // the remainder forward. Each `2n / n` sub-divide itself does
    // two `(3n/2) / n` calls via the recursive structure that — at
    // these widths — Knuth handles inside its own quotient loop, so
    // for now BZ here is essentially the chunked schoolbook outer
    // loop with Knuth as the kernel. The full §3 two-by-one /
    // three-by-two recursion is recorded in ALGORITHMS.md as the
    // next layer to add once a bench shows it winning.
    for q in quot.iter_mut() {
        *q = 0;
    }
    for r in rem.iter_mut() {
        *r = 0;
    }

    // Number of `n`-limb chunks in the dividend, rounded up so the
    // top chunk may be short.
    let chunks = top.div_ceil(n);
    let mut carry = [0u128; SCRATCH_LIMBS];
    let mut buf = [0u128; SCRATCH_LIMBS];
    let mut q_chunk = [0u128; SCRATCH_LIMBS];
    let mut r_chunk = [0u128; SCRATCH_LIMBS];

    // Walk chunks most-significant-first.
    let mut idx = chunks;
    while idx > 0 {
        idx -= 1;
        let lo = idx * n;
        let hi = ((idx + 1) * n).min(top);
        // buf = carry · 2^(n·128) + num[lo..hi]. carry holds the
        // running remainder from the previous step (≤ n limbs).
        buf.fill(0);
        let chunk_len = hi - lo;
        buf[..chunk_len].copy_from_slice(&num[lo..lo + chunk_len]);
        buf[chunk_len..chunk_len + n].copy_from_slice(&carry[..n]);
        let buf_len = chunk_len + n;
        // Divide.
        limbs_divmod_knuth(
            &buf[..buf_len],
            &den[..n],
            &mut q_chunk[..buf_len],
            &mut r_chunk[..n],
        );
        // Store quotient chunk. carry < den guarantees the chunk
        // quotient fits n limbs (q_chunk's upper limbs are zero).
        let store_end = (lo + n).min(quot.len());
        let store_len = store_end.saturating_sub(lo);
        quot[lo..lo + store_len].copy_from_slice(&q_chunk[..store_len]);
        // Carry the remainder.
        carry[..n].copy_from_slice(&r_chunk[..n]);
    }
    let rem_n = n.min(rem.len());
    rem[..rem_n].copy_from_slice(&carry[..rem_n]);
}

/// `out = floor(sqrt(n))` via Newton's method. `out` is zeroed then
/// filled.
pub(crate) fn limbs_isqrt(n: &[u128], out: &mut [u128]) {
    for o in out.iter_mut() {
        *o = 0;
    }
    let bits = limbs_bit_len(n);
    if bits == 0 {
        // sqrt(0) = 0 — `out` is already zeroed.
        return;
    }
    if bits <= 1 {
        // bits == 1 here, i.e. n == 1 → sqrt is 1.
        out[0] = 1;
        return;
    }
    // One extra limb of headroom for the `x + n/x` intermediate.
    let work = n.len() + 1;
    debug_assert!(work <= SCRATCH_LIMBS, "wide-int isqrt scratch overflow");
    let mut x = [0u128; SCRATCH_LIMBS];
    // Initial guess x = 2^ceil(bits/2) ≥ sqrt(n); starting from above,
    // the integer Newton iteration decreases monotonically to the
    // floor square root.
    let e = bits.div_ceil(2);
    x[(e / 128) as usize] |= 1u128 << (e % 128);
    loop {
        // Newton step: y = (x + n/x) / 2, all in `work`-limb buffers.
        let mut q = [0u128; SCRATCH_LIMBS];
        let mut r = [0u128; SCRATCH_LIMBS];
        limbs_divmod(n, &x[..work], &mut q[..work], &mut r[..work]);
        limbs_add_assign(&mut q[..work], &x[..work]);
        let mut y = [0u128; SCRATCH_LIMBS];
        limbs_shr(&q[..work], 1, &mut y[..work]);
        // Stop when the iterate no longer decreases: x is the answer.
        if limbs_cmp(&y[..work], &x[..work]) >= 0 {
            break;
        }
        x = y;
    }
    let copy_len = if out.len() < work { out.len() } else { work };
    out[..copy_len].copy_from_slice(&x[..copy_len]);
}

/// `limbs /= radix` in place, returning the remainder. `radix` must fit
/// a `u64` so the per-limb division stays within `u128`.
fn limbs_div_small(limbs: &mut [u128], radix: u128) -> u128 {
    // Walk most-significant limb first, splitting each 128-bit limb
    // into two 64-bit halves so `(carry << 64) | half` never overflows.
    let mut carry = 0u128;
    for limb in limbs.iter_mut().rev() {
        let upper = (carry << 64) | (*limb >> 64);
        let q_upper = upper / radix;
        let r_upper = upper % radix;
        let lower = (r_upper << 64) | (*limb & u128::from(u64::MAX));
        let q_lower = lower / radix;
        carry = lower % radix;
        *limb = (q_upper << 64) | q_lower;
    }
    carry
}
897
898/// Formats a limb slice into `buf` in the given radix (`2..=16`),
899/// returning the written digit subslice. The slice is treated as an
900/// unsigned magnitude.
901pub(crate) fn limbs_fmt_into<'a>(
902    limbs: &[u128],
903    radix: u128,
904    lower: bool,
905    buf: &'a mut [u8],
906) -> &'a str {
907    let digits: &[u8] = if lower {
908        b"0123456789abcdef"
909    } else {
910        b"0123456789ABCDEF"
911    };
912    if limbs_is_zero(limbs) {
913        let last = buf.len() - 1;
914        buf[last] = b'0';
915        return core::str::from_utf8(&buf[last..]).unwrap();
916    }
917    let mut work = [0u128; SCRATCH_LIMBS];
918    work[..limbs.len()].copy_from_slice(limbs);
919    let wl = limbs.len();
920    let mut pos = buf.len();
921    while !limbs_is_zero(&work[..wl]) {
922        let r = limbs_div_small(&mut work[..wl], radix);
923        pos -= 1;
924        buf[pos] = digits[r as usize];
925    }
926    core::str::from_utf8(&buf[pos..]).unwrap()
927}
928
929mod macros;
930use macros::decl_wide_int;
931
932
/// Signed three-way comparison of two limb slices given their signs.
/// Returns `-1` / `0` / `1`.
///
/// Opposite signs are decided by the flags alone. Same-sign operands
/// are compared unsigned via `limbs_cmp`, which yields the correct
/// signed order when `a` / `b` are raw two's-complement limbs (for
/// same-sign values the raw bit patterns order the same as the
/// values). NOTE(review): if any caller passes sign-magnitude limbs
/// instead, the both-negative case would need the comparison
/// inverted — confirm against the `decl_wide_int!` call sites.
#[inline]
pub(crate) const fn scmp(a_neg: bool, a: &[u128], b_neg: bool, b: &[u128]) -> i32 {
    match (a_neg, b_neg) {
        // Negative vs non-negative: the sign decides outright.
        (true, false) => -1,
        (false, true) => 1,
        // Same sign: unsigned limb comparison.
        _ => limbs_cmp(a, b),
    }
}
943
/// A signed integer that can be decomposed into a magnitude + sign and
/// rebuilt from one — the basis of `wide_cast` (widen / narrow between
/// any two widths, or between a wide integer and a primitive
/// `i128` / `i64` / `u128`).
pub(crate) trait WideInt: Copy {
    /// Magnitude limbs (little-endian, zero-padded to 64 limbs — the
    /// width of the largest generated type, `Uint8192`/`Int8192`)
    /// and sign.
    fn to_mag_sign(self) -> ([u128; 64], bool);
    /// Rebuilds from a magnitude limb slice and a sign, truncating the
    /// magnitude to this type's width.
    fn from_mag_sign(mag: &[u128], negative: bool) -> Self;
}
955
956/// Implements `WideInt` for a signed primitive integer.
957macro_rules! impl_wideint_signed_prim {
958    ($($t:ty),*) => {$(
959        impl WideInt for $t {
960            #[inline]
961            fn to_mag_sign(self) -> ([u128; 64], bool) {
962                let mut out = [0u128; 64];
963                out[0] = self.unsigned_abs() as u128;
964                (out, self < 0)
965            }
966            #[inline]
967            fn from_mag_sign(mag: &[u128], negative: bool) -> $t {
968                let m = mag.first().copied().unwrap_or(0) as $t;
969                if negative { m.wrapping_neg() } else { m }
970            }
971        }
972    )*};
973}
974impl_wideint_signed_prim!(i8, i16, i32, i64, i128);
975
976impl WideInt for u128 {
977    #[inline]
978    fn to_mag_sign(self) -> ([u128; 64], bool) {
979        let mut out = [0u128; 64];
980        out[0] = self;
981        (out, false)
982    }
983    #[inline]
984    fn from_mag_sign(mag: &[u128], _negative: bool) -> u128 {
985        mag.first().copied().unwrap_or(0)
986    }
987}
988
989/// Widening / narrowing cast between any two `WideInt` types via a
990/// shared magnitude + sign representation.
991#[inline]
992pub(crate) fn wide_cast<S: WideInt, T: WideInt>(src: S) -> T {
993    let (mag, negative) = src.to_mag_sign();
994    T::from_mag_sign(&mag, negative)
995}
996
// The concrete wide integer type pairs. The 256/512/1024-bit widths
// back the wide decimal tiers; 2048/4096-bit widths are the
// strict-transcendental work integers.
//
// Each invocation is `decl_wide_int!(UnsignedName, SignedName, N, 2N)`
// where `N` is the storage width in 128-bit limbs. The fourth argument
// is always twice the third — presumably the double-width limb count
// used for widening products; confirm against the macro in `macros`.
decl_wide_int!(Uint256, Int256, 2, 4);
decl_wide_int!(Uint384, Int384, 3, 6);
decl_wide_int!(Uint512, Int512, 4, 8);
decl_wide_int!(Uint768, Int768, 6, 12);
decl_wide_int!(Uint1024, Int1024, 8, 16);
decl_wide_int!(Uint1536, Int1536, 12, 24);
decl_wide_int!(Uint2048, Int2048, 16, 32);
decl_wide_int!(Uint3072, Int3072, 24, 48);
decl_wide_int!(Uint4096, Int4096, 32, 64);
decl_wide_int!(Uint6144, Int6144, 48, 96);
decl_wide_int!(Uint8192, Int8192, 64, 128);
1011
1012// Short aliases used by the decimal-tier macros (replacing the former
1013// `crate::wide` re-export shim). The signed alias is exposed at each
1014// width where it backs storage *or* serves as the next-width mul/div
1015// widening step; the unsigned alias only where `Display`'s magnitude
1016// path needs it. The feature gates mirror the call-site features.
1017#[cfg(any(feature = "d76", feature = "wide"))]
1018pub(crate) use self::{Int256 as I256, Uint256 as U256};
1019#[cfg(any(feature = "d76", feature = "d153", feature = "wide"))]
1020pub(crate) use self::Int512 as I512;
1021#[cfg(any(feature = "d153", feature = "wide"))]
1022pub(crate) use self::Uint512 as U512;
1023#[cfg(any(feature = "d153", feature = "d307", feature = "wide"))]
1024pub(crate) use self::Int1024 as I1024;
1025#[cfg(any(feature = "d307", feature = "wide"))]
1026pub(crate) use self::{Int2048 as I2048, Uint1024 as U1024};
1027
#[cfg(test)]
mod karatsuba_tests {
    use super::*;

    /// Karatsuba and schoolbook must agree bit-for-bit on every
    /// equal-length input that meets the threshold.
    #[test]
    fn karatsuba_matches_schoolbook_at_n16() {
        let lhs: [u128; 16] = core::array::from_fn(|i| (i as u128) * 0xdead_beef + 1);
        let rhs: [u128; 16] = core::array::from_fn(|i| 0xcafe_babe ^ ((i as u128) << 5));
        let mut schoolbook = [0u128; 32];
        let mut karatsuba = [0u128; 32];
        limbs_mul(&lhs, &rhs, &mut schoolbook);
        limbs_mul_karatsuba(&lhs, &rhs, &mut karatsuba);
        assert_eq!(schoolbook, karatsuba);
    }

    #[test]
    fn karatsuba_matches_schoolbook_at_n32() {
        let lhs: [u128; 32] = core::array::from_fn(|i| (i as u128).wrapping_mul(0x1234_5678_9abc));
        let rhs: [u128; 32] = core::array::from_fn(|i| (i as u128 + 1).wrapping_mul(0xfedc_ba98));
        let mut schoolbook = [0u128; 64];
        let mut karatsuba = [0u128; 64];
        limbs_mul(&lhs, &rhs, &mut schoolbook);
        limbs_mul_karatsuba(&lhs, &rhs, &mut karatsuba);
        assert_eq!(schoolbook, karatsuba);
    }

    /// A zero operand must yield an all-zero product.
    #[test]
    fn karatsuba_handles_zero_inputs() {
        let zero_operand = [0u128; 16];
        let other: [u128; 16] = core::array::from_fn(|i| (i as u128) + 1);
        let mut product = [0u128; 32];
        limbs_mul_karatsuba(&zero_operand, &other, &mut product);
        assert!(product.iter().all(|&limb| limb == 0));
    }
}
1067
#[cfg(test)]
mod hint_tests {
    use super::*;

    #[test]
    fn signed_add_sub_neg() {
        let int = Int256::from_i128;
        let five = int(5);
        let three = int(3);
        assert_eq!(five.wrapping_add(three), int(8));
        assert_eq!(five.wrapping_sub(three), int(2));
        assert_eq!(three.wrapping_sub(five), int(-2));
        assert_eq!(five.negate(), int(-5));
        assert_eq!(Int256::ZERO.negate(), Int256::ZERO);
    }

    #[test]
    fn signed_mul_div_rem() {
        let int = Int512::from_i128;
        let six = int(6);
        let two = int(2);
        let three = int(3);
        assert_eq!(six.wrapping_mul(three), int(18));
        assert_eq!(six.wrapping_div(two), three);
        // Remainder takes the dividend's sign.
        assert_eq!(int(7).wrapping_rem(three), int(1));
        assert_eq!(int(-7).wrapping_rem(three), int(-1));
        assert_eq!(six.negate().wrapping_mul(three), int(-18));
    }

    #[test]
    fn checked_overflow() {
        // In-range addition succeeds…
        assert_eq!(
            Int256::from_i128(2).checked_add(Int256::from_i128(3)),
            Some(Int256::from_i128(5))
        );
        // …while the saturating boundaries report None.
        assert_eq!(Int256::MAX.checked_add(Int256::ONE), None);
        assert_eq!(Int256::MIN.checked_sub(Int256::ONE), None);
        assert_eq!(Int256::MIN.checked_neg(), None);
    }

    #[test]
    fn from_str_and_pow() {
        let ten = Int1024::from_str_radix("10", 10).unwrap();
        assert_eq!(ten, Int1024::from_i128(10));
        assert_eq!(ten.pow(3), Int1024::from_i128(1000));
        // 10^40 computed via pow must equal the parsed literal.
        let via_pow = Int1024::from_str_radix("10", 10).unwrap().pow(40);
        let via_parse = Int1024::from_str_radix(
            "10000000000000000000000000000000000000000",
            10,
        )
        .unwrap();
        assert_eq!(via_pow, via_parse);
        assert_eq!(Int256::from_str_radix("-42", 10).unwrap(), Int256::from_i128(-42));
    }

    #[test]
    fn ordering_and_resize() {
        assert!(Int256::from_i128(-1) < Int256::ZERO);
        assert!(Int256::MIN < Int256::MAX);
        // Widen and narrow back; the value survives the round trip.
        let narrow = Int256::from_i128(-123_456_789);
        let widened: Int1024 = narrow.resize();
        let round_trip: Int256 = widened.resize();
        assert_eq!(round_trip, narrow);
        assert_eq!(widened, Int1024::from_i128(-123_456_789));
    }

    #[test]
    fn isqrt_and_f64() {
        assert_eq!(Int512::from_i128(144).isqrt(), Int512::from_i128(12));
        assert_eq!(Int256::from_i128(1_000_000).as_f64(), 1_000_000.0);
        assert_eq!(Int256::from_f64(-2_500.0), Int256::from_i128(-2500));
    }

    /// `Uint256` (the unsigned macro emission) supports the same
    /// bit/sign-manipulation surface as the signed sibling. These
    /// methods are reachable through the wide decimal types but not
    /// always exercised by name, so verify the contracts directly.
    #[test]
    fn uint256_is_zero_and_bit_helpers() {
        let parse = |s: &str| Uint256::from_str_radix(s, 10).unwrap();
        let zero = Uint256::ZERO;
        let one = parse("1");
        let two = parse("2");
        let three = parse("3");
        let four = parse("4");
        assert!(zero.is_zero());
        assert!(!one.is_zero());
        assert!(one.is_power_of_two());
        assert!(two.is_power_of_two());
        assert!(!three.is_power_of_two());
        // next_power_of_two: 0 → 1, 1 → 1 (already a power), 3 → 4.
        assert_eq!(zero.next_power_of_two(), one);
        assert_eq!(one.next_power_of_two(), one);
        assert_eq!(three.next_power_of_two(), four);
        // Popcount and leading-zero count at the extremes.
        assert_eq!(zero.count_ones(), 0);
        assert_eq!(one.count_ones(), 1);
        assert_eq!(zero.leading_zeros(), Uint256::BITS);
        assert_eq!(one.leading_zeros(), Uint256::BITS - 1);
    }

    #[test]
    fn uint256_parse_arithmetic_and_pow() {
        let parse = |s: &str| Uint256::from_str_radix(s, 10).unwrap();
        // Only radix 10 is accepted by from_str_radix…
        assert!(Uint256::from_str_radix("10", 2).is_err());
        // …and a non-digit byte is rejected.
        assert!(Uint256::from_str_radix("1a", 10).is_err());
        let one = parse("1");
        let two = parse("2");
        let three = parse("3");
        let four = parse("4");
        let five = parse("5");
        let six = parse("6");
        let seven = parse("7");
        // Arithmetic: 3 - 2 = 1, 6 / 2 = 3, 7 % 3 = 1.
        assert_eq!(three - two, one);
        assert_eq!(six / two, three);
        assert_eq!(seven % three, one);
        // Bitwise ops on 101 / 100 / 001.
        assert_eq!(five & four, four);
        assert_eq!(five | one, five);
        assert_eq!(five ^ four, one);
        // pow: 2^10 = 1024.
        assert_eq!(two.pow(10), parse("1024"));
        // cast_signed round-trip.
        assert_eq!(three.cast_signed(), Int256::from_i128(3));
    }

    /// `Int256::bit` reports the two's-complement bit at any index;
    /// indices past the storage width return the sign bit.
    #[test]
    fn signed_bit_and_trailing_zeros() {
        let value = Int256::from_i128(0b1100);
        assert!(!value.bit(0));
        assert!(!value.bit(1));
        assert!(value.bit(2));
        assert!(value.bit(3));
        // Past the storage width the sign bit is reported: clear for
        // a non-negative value…
        assert!(!value.bit(1000));
        // …set for a negative one.
        assert!(Int256::from_i128(-1).bit(1000));
        // trailing_zeros, including the all-zero special case.
        assert_eq!(Int256::from_i128(8).trailing_zeros(), 3);
        assert_eq!(Int256::ZERO.trailing_zeros(), Int256::BITS);
    }
}
1216
#[cfg(test)]
mod slice_tests {
    use super::*;

    #[test]
    fn mul_and_divmod_round_trip() {
        let lhs = [123u128, 7, 0, 0];
        let rhs = [456u128, 0, 0, 0];
        let mut product = [0u128; 8];
        limbs_mul(&lhs, &rhs, &mut product);
        // Dividing the product by one factor recovers the other.
        let mut quotient = [0u128; 8];
        let mut remainder = [0u128; 8];
        limbs_divmod(&product, &rhs, &mut quotient, &mut remainder);
        assert_eq!(&quotient[..4], &lhs, "quotient");
        assert!(limbs_is_zero(&remainder), "remainder");
    }

    #[test]
    fn shifts() {
        // A shift of 130 = one whole limb plus two bits.
        let input = [1u128, 0];
        let mut shifted = [0u128; 2];
        limbs_shl(&input, 130, &mut shifted);
        assert_eq!(shifted, [0, 4]);
        let mut restored = [0u128; 2];
        limbs_shr(&shifted, 130, &mut restored);
        assert_eq!(restored, [1, 0]);
    }

    #[test]
    fn isqrt_basic() {
        // sqrt(2^256) = 2^128 — one limb up.
        let mut root = [0u128; 4];
        limbs_isqrt(&[0u128, 0, 1, 0], &mut root);
        assert_eq!(root, [0, 1, 0, 0]);
        // A perfect square.
        let mut root = [0u128; 2];
        limbs_isqrt(&[144u128, 0], &mut root);
        assert_eq!(root, [12, 0]);
        // Floor behaviour: isqrt(2) = 1.
        let mut root = [0u128; 2];
        limbs_isqrt(&[2u128, 0], &mut root);
        assert_eq!(root, [1, 0]);
    }

    #[test]
    fn add_sub_carry() {
        let mut acc = [u128::MAX, 0];
        // Adding 1 ripples into the next limb without overflowing.
        assert!(!limbs_add_assign(&mut acc, &[1, 0]));
        assert_eq!(acc, [0, 1]);
        // Subtracting 1 borrows back down.
        assert!(!limbs_sub_assign(&mut acc, &[1, 0]));
        assert_eq!(acc, [u128::MAX, 0]);
    }

    /// `div_2_by_1` matches the obvious `(high·2^128 + low) / d`
    /// formula on representative inputs.
    #[test]
    fn div_2_by_1_basics() {
        // 1 / 1 = 1 r 0 and 5 / 2 = 2 r 1.
        assert_eq!(div_2_by_1(0, 1, 1), (1, 0));
        assert_eq!(div_2_by_1(0, 5, 2), (2, 1));
        // (3·2^128) / 4 = 3·2^126 r 0.
        assert_eq!(div_2_by_1(3, 0, 4), (3 << 126, 0));
        // High limb just under the divisor — exercises the r_top
        // overflow recovery in the inner loop. Verify via
        // q·d + r == (d−1)·2^128 + (2^128 − 1).
        let divisor = u128::MAX - 7;
        let (quot, rem) = div_2_by_1(divisor - 1, u128::MAX, divisor);
        let (prod_hi, prod_lo) = mul_128(quot, divisor);
        let (check_lo, carried) = prod_lo.overflowing_add(rem);
        let check_hi = prod_hi + carried as u128;
        assert_eq!(check_hi, divisor - 1);
        assert_eq!(check_lo, u128::MAX);
        assert!(rem < divisor);
    }

    /// `limbs_divmod_knuth` agrees with the canonical `limbs_divmod`
    /// on a battery of representative shapes — single-limb divisors,
    /// multi-limb divisors, zero remainders, partial overflows in the
    /// q̂ refinement step.
    #[test]
    fn knuth_matches_canonical_divmod() {
        let cases: &[(&[u128], &[u128])] = &[
            // Simple
            (&[42], &[7]),
            (&[u128::MAX, 0], &[2]),
            // Multi-limb numerator, single-limb denominator.
            (&[1, 1, 0, 0], &[3]),
            // Multi-limb both — three-limb numerator by two-limb den.
            (&[u128::MAX, u128::MAX, 1, 0], &[5, 9]),
            // Three-limb both.
            (&[u128::MAX, u128::MAX, u128::MAX, 0], &[1, 2, 3]),
            // Numerator < denominator — quotient zero, remainder = num.
            (&[100, 0, 0], &[200, 0, 1]),
            // Equal high limbs (forces the u_top ≥ v_top branch).
            (&[0, 0, u128::MAX, u128::MAX], &[1, 2, u128::MAX]),
        ];
        for (num, den) in cases {
            let mut expect_q = [0u128; 8];
            let mut expect_r = [0u128; 8];
            limbs_divmod(num, den, &mut expect_q, &mut expect_r);
            let mut actual_q = [0u128; 8];
            let mut actual_r = [0u128; 8];
            limbs_divmod_knuth(num, den, &mut actual_q, &mut actual_r);
            assert_eq!(expect_q, actual_q, "quotient mismatch on {:?} / {:?}", num, den);
            assert_eq!(expect_r, actual_r, "remainder mismatch on {:?} / {:?}", num, den);
        }
    }

    /// `limbs_divmod_bz` agrees with the canonical path on
    /// medium-and-large operands. Recursion engages only above the
    /// `BZ_THRESHOLD = 8` limb cutoff.
    #[test]
    fn bz_matches_canonical_divmod() {
        // A 16-limb dividend over a 10-limb divisor — well above
        // BZ_THRESHOLD, so the recursive path is exercised.
        let mut num = [0u128; 16];
        for (i, limb) in num.iter_mut().enumerate() {
            *limb = (i as u128)
                .wrapping_mul(0x9E37_79B9_7F4A_7C15)
                .wrapping_add(i as u128);
        }
        let mut den = [0u128; 10];
        for (i, limb) in den.iter_mut().enumerate() {
            *limb = ((i + 1) as u128).wrapping_mul(0xBF58_476D_1CE4_E5B9);
        }
        let mut expect_q = [0u128; 16];
        let mut expect_r = [0u128; 16];
        limbs_divmod(&num, &den, &mut expect_q, &mut expect_r);
        let mut actual_q = [0u128; 16];
        let mut actual_r = [0u128; 16];
        limbs_divmod_bz(&num, &den, &mut actual_q, &mut actual_r);
        assert_eq!(expect_q, actual_q, "BZ quotient mismatch");
        assert_eq!(expect_r, actual_r, "BZ remainder mismatch");
    }

    /// `limbs_mul_fast` dispatches to Karatsuba when both operands are
    /// equal-length ≥ `KARATSUBA_MIN`. Verify by comparing the result
    /// against schoolbook (`limbs_mul`) on a 16-limb pair.
    #[cfg(feature = "alloc")]
    #[test]
    fn fast_mul_dispatches_to_karatsuba_at_threshold() {
        let lhs: [u128; 16] = core::array::from_fn(|i| (i as u128).wrapping_mul(0xABCD) + 1);
        let rhs: [u128; 16] = core::array::from_fn(|i| (i as u128).wrapping_mul(0xBEEF) + 7);
        let mut via_fast = [0u128; 32];
        let mut via_school = [0u128; 32];
        limbs_mul_fast(&lhs, &rhs, &mut via_fast);
        limbs_mul(&lhs, &rhs, &mut via_school);
        assert_eq!(via_fast, via_school, "fast (Karatsuba) and schoolbook disagree");
    }

    /// `limbs_mul_fast` falls through to schoolbook for unequal lengths
    /// or below the threshold. The 8-limb pair below skips Karatsuba.
    #[cfg(feature = "alloc")]
    #[test]
    fn fast_mul_falls_through_to_schoolbook_below_threshold() {
        let lhs: [u128; 8] = core::array::from_fn(|i| (i as u128).wrapping_mul(0x1234) + 1);
        let rhs: [u128; 8] = core::array::from_fn(|i| (i as u128).wrapping_mul(0x5678) + 3);
        let mut via_fast = [0u128; 16];
        let mut via_school = [0u128; 16];
        limbs_mul_fast(&lhs, &rhs, &mut via_fast);
        limbs_mul(&lhs, &rhs, &mut via_school);
        assert_eq!(via_fast, via_school);
    }

    /// Karatsuba called directly below the threshold should still
    /// produce the correct product via its internal schoolbook
    /// fall-through — the safety branch in `limbs_mul_karatsuba` that
    /// zeros out and delegates when `n < KARATSUBA_MIN`.
    #[cfg(feature = "alloc")]
    #[test]
    fn karatsuba_safety_fallback_below_threshold() {
        let lhs: [u128; 4] = [123, 456, 789, 0];
        let rhs: [u128; 4] = [987, 654, 321, 0];
        let mut via_karatsuba = [0u128; 8];
        let mut via_school = [0u128; 8];
        limbs_mul_karatsuba(&lhs, &rhs, &mut via_karatsuba);
        limbs_mul(&lhs, &rhs, &mut via_school);
        assert_eq!(via_karatsuba, via_school);
    }

    /// `limbs_isqrt` of `1` returns `1` via the `bits <= 1` short-
    /// circuit.
    #[test]
    fn isqrt_one_short_circuit() {
        let mut root = [0u128; 2];
        limbs_isqrt(&[1u128, 0], &mut root);
        assert_eq!(root, [1, 0]);
    }

    /// `limbs_isqrt` of `0` returns `0` via the `bits == 0` short-
    /// circuit.
    #[test]
    fn isqrt_zero_short_circuit() {
        let mut root = [0u128; 2];
        limbs_isqrt(&[0u128, 0], &mut root);
        assert_eq!(root, [0, 0]);
    }

    /// `WideInt::from_mag_sign` for `u128` reads the first limb and
    /// ignores the sign flag. Exercised through a chained `wide_cast`
    /// `Int256 → u128`.
    #[test]
    fn wide_cast_into_u128_returns_first_limb() {
        let as_u128: u128 = wide_cast(Int256::from_i128(123_456_789));
        assert_eq!(as_u128, 123_456_789);
        // Casting ZERO yields 0.
        let as_zero: u128 = wide_cast(Int256::ZERO);
        assert_eq!(as_zero, 0);
    }

    /// Knuth's q̂-cap path fires when `u_top >= v_top` in the
    /// per-quotient-limb loop. Engineer a dividend whose normalised
    /// top limb equals the normalised divisor top so the cap (`q̂ =
    /// u128::MAX`, plus the subsequent multiply-subtract correction)
    /// runs, then verify against the canonical `limbs_divmod`.
    #[test]
    fn knuth_q_hat_cap_branch_matches_canonical() {
        // num top limb == den top limb; the quotient's first chunk
        // hits the cap. A divisor top close to u128::MAX tightens the
        // normalisation shift.
        let num: [u128; 4] = [0, 0, u128::MAX, u128::MAX >> 1];
        let den: [u128; 3] = [1, 2, u128::MAX >> 1];
        let mut expect_q = [0u128; 4];
        let mut expect_r = [0u128; 4];
        limbs_divmod(&num, &den, &mut expect_q, &mut expect_r);
        let mut actual_q = [0u128; 4];
        let mut actual_r = [0u128; 4];
        limbs_divmod_knuth(&num, &den, &mut actual_q, &mut actual_r);
        assert_eq!(expect_q, actual_q);
        assert_eq!(expect_r, actual_r);
    }

    /// `limbs_divmod_bz` with a numerator that has trailing zero limbs
    /// strips them off in its top-non-zero scan before deciding whether
    /// to recurse.
    #[test]
    fn bz_strips_numerator_trailing_zeros() {
        // 16-limb buffer with only the low half non-zero; den is 10
        // limbs. BZ should recognise top < 2*n and fall back to Knuth.
        let mut num = [0u128; 16];
        for limb in &mut num[..8] {
            *limb = 0xCAFE_F00D;
        }
        let mut den = [0u128; 10];
        den[0] = 7;
        let mut expect_q = [0u128; 16];
        let mut expect_r = [0u128; 16];
        limbs_divmod(&num, &den, &mut expect_q, &mut expect_r);
        let mut actual_q = [0u128; 16];
        let mut actual_r = [0u128; 16];
        limbs_divmod_bz(&num, &den, &mut actual_q, &mut actual_r);
        assert_eq!(expect_q, actual_q);
        assert_eq!(expect_r, actual_r);
    }
}