1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
//! Single Euclidean step using an approximate u64 quotient with correction.
use super::GcdState;
use crate::u320::U320;
impl GcdState {
/// Performs one Euclidean step using an approximate u64 quotient,
/// correcting it to the exact value via U320 comparison loops.
///
/// Called when the Lehmer loop fails to batch (identity matrix), meaning
/// the first quotient is too large for matrix accumulation but `r1`'s
/// top-bit approximation is non-zero (bit-length gap < 64). The exact
/// quotient is guaranteed to fit in u64 since `b != 0` implies the
/// remainders differ by at most ~63 bits.
///
/// The approximate quotient `q = a / b` (from 63-bit top-bit extraction)
/// may over- or under-estimate the true quotient by a small amount due
/// to truncation of lower bits. Two correction loops adjust `q`:
///
/// 1. **Over-estimation**: while `q × r1 > r0`, decrease `q` (subtract
/// `r1` from the running product).
/// 2. **Under-estimation**: while `remainder ≥ r1`, increase `q`
/// (subtract `r1` from the remainder).
///
/// The coefficient update uses the cheap `U256 × u64 → U320` multiply
/// (4 limb multiplications) instead of the full `U256 × U256` multiply
/// (16 limb multiplications) used by [`full_step`](Self::full_step).
#[inline]
pub fn scalar_step(&mut self, mut q: u64) {
let r0_wide = U320::from(self.r0);
let r1_wide = U320::from(self.r1);
let mut qr1 = self.r1 * q;
// Over-estimated? Decrease q until q*r1 <= r0.
while qr1 > r0_wide {
qr1 = qr1.overflowing_sub(&r1_wide).0;
q -= 1;
}
// rem = r0 - q*r1 (non-negative)
let mut rem = r0_wide.overflowing_sub(&qr1).0;
// Under-estimated? Increase q until rem < r1.
while rem >= r1_wide {
rem = rem.overflowing_sub(&r1_wide).0;
q += 1;
}
self.r0 = self.r1;
self.r1 = rem.low_u256();
// Coefficient update: new_t = t0 - q * t1 (wrapping 256-bit)
let qt1 = (self.t1 * q).low_u256();
let new_t = self.t0.overflowing_sub(&qt1).0;
self.t0 = self.t1;
self.t1 = new_t;
self.even = !self.even;
}
}
#[cfg(test)]
mod ai_tests {
use crate::u256::U256;
use super::*;
/// Single scalar step on small values: gcd(100, 37), q=2.
#[test]
fn small_values() {
let mut state = GcdState::new(
U256::from_be_limbs([0, 0, 0, 100]),
U256::from_be_limbs([0, 0, 0, 37]),
);
state.scalar_step(2);
assert_eq!(state.r0, U256::from_be_limbs([0, 0, 0, 37]));
assert_eq!(state.r1, U256::from_be_limbs([0, 0, 0, 26]));
assert!(!state.even);
}
/// Exact quotient: corrects from approximate q=3 down to 2.
#[test]
fn corrects_overestimate() {
let mut state = GcdState::new(
U256::from_be_limbs([0, 0, 0, 100]),
U256::from_be_limbs([0, 0, 0, 37]),
);
// True quotient is 2, but we pass 3 (over-estimate by 1).
state.scalar_step(3);
assert_eq!(state.r0, U256::from_be_limbs([0, 0, 0, 37]));
assert_eq!(state.r1, U256::from_be_limbs([0, 0, 0, 26]));
}
/// Exact quotient: corrects from approximate q=1 up to 2.
#[test]
fn corrects_underestimate() {
let mut state = GcdState::new(
U256::from_be_limbs([0, 0, 0, 100]),
U256::from_be_limbs([0, 0, 0, 37]),
);
// True quotient is 2, but we pass 1 (under-estimate by 1).
state.scalar_step(1);
assert_eq!(state.r0, U256::from_be_limbs([0, 0, 0, 37]));
assert_eq!(state.r1, U256::from_be_limbs([0, 0, 0, 26]));
}
/// Parity toggles on each scalar step.
#[test]
fn parity_toggles() {
let mut state = GcdState::new(
U256::from_be_limbs([0, 0, 0, 100]),
U256::from_be_limbs([0, 0, 0, 37]),
);
assert!(state.even);
state.scalar_step(2);
assert!(!state.even);
}
/// Coefficient tracking: t0 and t1 advance correctly.
#[test]
fn coefficient_update() {
let mut state = GcdState::new(
U256::from_be_limbs([0, 0, 0, 100]),
U256::from_be_limbs([0, 0, 0, 37]),
);
// Initially t0=0, t1=1. After step: q=2.
// new_t = t0 - q*t1 = 0 - 2*1 = -2 (wrapping)
state.scalar_step(2);
assert_eq!(state.t0, U256::ONE);
let neg2 = U256::ZERO
.overflowing_sub(&U256::from_be_limbs([0, 0, 0, 2]))
.0;
assert_eq!(state.t1, neg2);
}
/// Large correction: approximate quotient off by ~32 still converges.
#[test]
fn large_correction() {
let r0 = U256::from_be_limbs([0, 0, 0, 1 << 48]);
let r1 = U256::from_be_limbs([0, 0, 0, 1 << 15]);
let mut state = GcdState::new(r0, r1);
// True quotient: (1<<48) / (1<<15) = 1<<33 = 8589934592
// Pass an approximate quotient off by 30.
let approx_q = (1u64 << 33) + 30;
state.scalar_step(approx_q);
assert_eq!(state.r0, U256::from_be_limbs([0, 0, 0, 1 << 15]));
assert_eq!(state.r1, U256::ZERO);
}
/// Regression: the input that triggered the original mod_inv bug.
#[test]
fn regression_input() {
let p = U256::from_be_limbs([
0xFFFFFFFFFFFFFFFF,
0xFFFFFFFFFFFFFFFF,
0xFFFFFFFFFFFFFFFF,
0xFFFFFFFEFFFFFC2F,
]);
let dx = U256::from_be_limbs([
16861361263094501749,
6454130256144472189,
7904638317253979306,
10596543604346685278,
]);
let state = GcdState::new(p, dx);
let inv = state.run().inverse();
assert!(inv.is_some());
assert_eq!(dx.mul_mod(&inv.unwrap(), &p), U256::ONE);
}
}