1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
// Copyright © 2026 Mikhail Hogrefe
//
// Uses code adopted from the GNU MPFR Library.
//
// Copyright 2008-2024 Free Software Foundation, Inc.
//
// Contributed by the AriC and Caramba projects, INRIA.
//
// This file is part of Malachite.
//
// Malachite is free software: you can redistribute it and/or modify it under the terms of the GNU
// Lesser General Public License (LGPL) as published by the Free Software Foundation; either version
// 3 of the License, or (at your option) any later version. See <https://www.gnu.org/licenses/>.
use crate::natural::arithmetic::add::{
limbs_slice_add_greater_in_place_left, limbs_slice_add_limb_in_place,
};
use crate::natural::arithmetic::mul::{
limbs_mul_greater_to_out, limbs_mul_greater_to_out_scratch_len,
};
use crate::natural::arithmetic::shl::limbs_slice_shl_in_place;
use crate::natural::arithmetic::square::{limbs_square_to_out, limbs_square_to_out_scratch_len};
use crate::natural::arithmetic::sub::{limbs_sub_greater_in_place_left, limbs_sub_limb_in_place};
use crate::natural::logic::not::{limbs_not_in_place, limbs_not_to_out};
use crate::natural::{
LIMB_HIGH_BIT, bit_to_limb_count_ceiling, bit_to_limb_count_floor, limb_to_bit_count,
};
use crate::platform::Limb;
use malachite_base::num::arithmetic::traits::{PowerOf2, Square, XMulYToZZ};
use malachite_base::num::basic::integers::PrimitiveInt;
use malachite_base::num::conversion::traits::ExactFrom;
// the following T1 and T2 are bipartite tables giving initial approximation for the inverse square
// root, with 13-bit input split in 5 + 4 + 4, and 11-bit output. More precisely, if 2048 <= i <
// 8192, with i = a * 2 ^ 8 + b * 2 ^ 4 + c, we use for approximation of 2048 / sqrt(i / 2048) the
// value x = T1[16 * (a - 8) + b] + T2[16 * (a - 8) + c]. The largest error is obtained for i =
// 2054, where x = 2044, and 2048 / sqrt(i / 2048) = 2045.006576...
//
// This is T1 from rec_sqrt.c, MPFR 4.3.0.
const T1: [u16; 384] = [
2040, 2033, 2025, 2017, 2009, 2002, 1994, 1987, 1980, 1972, 1965, 1958, 1951, 1944, 1938, 1931,
1925, 1918, 1912, 1905, 1899, 1892, 1886, 1880, 1874, 1867, 1861, 1855, 1849, 1844, 1838, 1832,
1827, 1821, 1815, 1810, 1804, 1799, 1793, 1788, 1783, 1777, 1772, 1767, 1762, 1757, 1752, 1747,
1742, 1737, 1733, 1728, 1723, 1718, 1713, 1709, 1704, 1699, 1695, 1690, 1686, 1681, 1677, 1673,
1669, 1664, 1660, 1656, 1652, 1647, 1643, 1639, 1635, 1631, 1627, 1623, 1619, 1615, 1611, 1607,
1603, 1600, 1596, 1592, 1588, 1585, 1581, 1577, 1574, 1570, 1566, 1563, 1559, 1556, 1552, 1549,
1545, 1542, 1538, 1535, 1532, 1528, 1525, 1522, 1518, 1515, 1512, 1509, 1505, 1502, 1499, 1496,
1493, 1490, 1487, 1484, 1481, 1478, 1475, 1472, 1469, 1466, 1463, 1460, 1457, 1454, 1451, 1449,
1446, 1443, 1440, 1438, 1435, 1432, 1429, 1427, 1424, 1421, 1419, 1416, 1413, 1411, 1408, 1405,
1403, 1400, 1398, 1395, 1393, 1390, 1388, 1385, 1383, 1380, 1378, 1375, 1373, 1371, 1368, 1366,
1363, 1360, 1358, 1356, 1353, 1351, 1349, 1346, 1344, 1342, 1340, 1337, 1335, 1333, 1331, 1329,
1327, 1325, 1323, 1321, 1319, 1316, 1314, 1312, 1310, 1308, 1306, 1304, 1302, 1300, 1298, 1296,
1294, 1292, 1290, 1288, 1286, 1284, 1282, 1280, 1278, 1276, 1274, 1272, 1270, 1268, 1266, 1265,
1263, 1261, 1259, 1257, 1255, 1253, 1251, 1250, 1248, 1246, 1244, 1242, 1241, 1239, 1237, 1235,
1234, 1232, 1230, 1229, 1227, 1225, 1223, 1222, 1220, 1218, 1217, 1215, 1213, 1212, 1210, 1208,
1206, 1204, 1203, 1201, 1199, 1198, 1196, 1195, 1193, 1191, 1190, 1188, 1187, 1185, 1184, 1182,
1181, 1180, 1178, 1177, 1175, 1174, 1172, 1171, 1169, 1168, 1166, 1165, 1163, 1162, 1160, 1159,
1157, 1156, 1154, 1153, 1151, 1150, 1149, 1147, 1146, 1144, 1143, 1142, 1140, 1139, 1137, 1136,
1135, 1133, 1132, 1131, 1129, 1128, 1127, 1125, 1124, 1123, 1121, 1120, 1119, 1117, 1116, 1115,
1114, 1113, 1111, 1110, 1109, 1108, 1106, 1105, 1104, 1103, 1101, 1100, 1099, 1098, 1096, 1095,
1093, 1092, 1091, 1090, 1089, 1087, 1086, 1085, 1084, 1083, 1081, 1080, 1079, 1078, 1077, 1076,
1075, 1073, 1072, 1071, 1070, 1069, 1068, 1067, 1065, 1064, 1063, 1062, 1061, 1060, 1059, 1058,
1057, 1056, 1055, 1054, 1052, 1051, 1050, 1049, 1048, 1047, 1046, 1045, 1044, 1043, 1042, 1041,
1040, 1039, 1038, 1037, 1036, 1035, 1034, 1033, 1032, 1031, 1030, 1029, 1028, 1027, 1026, 1025,
];
// This is T2 from rec_sqrt.c, MPFR 4.3.0.
const T2: [u8; 384] = [
7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1, 0, 6, 5, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 0, 0,
5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, 4, 4, 3, 3, 3, 3, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0,
3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0,
3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
// Put in X a p-bit approximation of 1/sqrt(A), where X = {x, n} / B ^ n, n = ceil(p / Limb::WIDTH),
// A = 2 ^ (1 + a_s) * {a, an} / B ^ an, a_s is 0 or 1, an = ceil(ap / Limb::WIDTH), where B = 2 ^
// Limb::WIDTH.
//
// We have 1 <= A < 4 and 1/2 <= X < 1.
//
// The error in the approximate result with respect to the true value 1/sqrt(A) is bounded by 1
// ulp(X), i.e., 2^{-p} since 1/2 <= X < 1.
//
// Note: x and a are left-aligned, i.e., the most significant bit of a[an-1] is set, and so is the
// most significant bit of the output x[n-1].
//
// If p is not a multiple of Limb::WIDTH, the extra low bits of the input A are taken into account
// to compute the approximation of 1/sqrt(A), but whether or not they are zero, the error between X
// and 1/sqrt(A) is bounded by 1 ulp(X) [in precision p]. The extra low bits of the output X (if p
// is not a multiple of Limb::WIDTH) are set to 0.
//
// Assumptions:
// - A should be normalized, i.e., the most significant bit of a[an-1] should be 1. If a_s = 0, we
// have 1 <= A < 2; if a_s = 1, we have 2 <= A < 4.
// - p >= 12
// - {a, an} and {x, n} should not overlap
// - Limb::WIDTH >= 12 and is even
//
// Note: this routine is much more efficient when ap is small compared to p, including the case
// where ap <= Limb::WIDTH, thus it can be used to implement an efficient mpfr_rec_sqrt_ui function.
//
// References: [1] Modern Computer Algebra, Richard Brent and Paul Zimmermann,
// https://members.loria.fr/PZimmermann/mca/pub226.html
//
// This is mpfr_mpn_rec_sqrt from rec_sqrt.c, MPFR 4.3.0.
pub fn limbs_reciprocal_sqrt(
out: &mut [Limb],
out_prec: u64,
mut xs: &[Limb],
x_prec: u64,
parity: bool,
) {
let out_len = bit_to_limb_count_ceiling(out_prec);
let out = &mut out[..out_len];
let mut xs_len = bit_to_limb_count_ceiling(x_prec);
// A should be normalized
assert!(xs[xs_len - 1].get_highest_bit());
assert!(out_prec >= 11);
if xs_len > out_len {
// we can cut the input to n limbs
xs = &xs[xs_len - out_len..];
xs_len = out_len;
}
if out_prec == 11 {
// should happen only from recursive calls
//
// take the 12 + a_s most significant bits of A
let i =
usize::exact_from(xs[xs_len - 1] >> (const { Limb::WIDTH - 12 } - u64::from(parity)));
let ab = i >> 4;
let ac = (ab & 0x3f0) | (i & 0xf);
let t = T1[ab - 0x80] + u16::from(T2[ac - 0x80]); // fits on 16 bits
// x has only one limb
out[0] = Limb::from(t) << (Limb::WIDTH - out_prec);
} else {
// p >= 12
//
// compared to Algorithm 3.9 of [1], we have {a, an} = A / 2 if a_s = 0, and A / 4 if a_s =
// 1.
//
// h = max(11, ceil((p + 3) / 2)) is the bitsize of the recursive call
let h = if out_prec < 18 {
11
} else {
(out_prec >> 1) + 2
};
// limb size of the recursive Xh
let xs_rec_len = bit_to_limb_count_ceiling(h);
// a priori limb size of Xh^2
let rn = bit_to_limb_count_ceiling(h << 1);
let ln = out_len - xs_rec_len; // remaining limbs to be computed
// Since |Xh - A ^ {-1 / 2}| <= 2 ^ {-h}, then by multiplying by Xh + A ^ {-1 / 2} we get
// |Xh ^ 2 - 1 / A| <= 2 ^ {-h + 1}, thus |A * Xh ^ 2 - 1| <= 2 ^ {-h + 3}, thus the h-3
// most significant bits of t should be zero, which is in fact h + 1 + a_s - 3 because of
// the normalization of A. This corresponds to th=floor((h + 1 + a_s - 3) / Limb::WIDTH)
// limbs.
//
// More precisely we have |Xh ^ 2 - 1 / A| <= 2^{-h} * (Xh + A^{-1 / 2}) <= 2^{-h} * (2
// A^{-1 / 2} + 2^{-h}) <= 2.001 * 2^{-h} * A^{-1 / 2} since A < 4 and h >= 11, thus |A * Xh
// ^ 2 - 1| <= 2.001 * 2^{-h} * A^{1 / 2} <= 1.001 * 2 ^ {2 - h}. This is sufficient to
// prove that the upper limb of {t,tn} below is less that 0.501 * 2 ^ Limb::WIDTH, thus cu =
// 0 below.
let hp = h + 1 + u64::from(parity);
let th = bit_to_limb_count_floor(hp - 3);
let mut ts_len = bit_to_limb_count_ceiling(hp + h);
// we need h + 1 + a_s bits of a
//
// number of high limbs of A needed for the recursive call
let mut ahn = bit_to_limb_count_ceiling(hp);
if ahn > xs_len {
ahn = xs_len;
}
let (out_lo, out_hi) = out.split_at_mut(ln);
limbs_reciprocal_sqrt(
out_hi,
h,
&xs[xs_len - ahn..],
limb_to_bit_count(ahn),
parity,
);
// the most h significant bits of X are set, X has ceil(h / Limb::WIDTH) limbs, the low (-h)
// % Limb::WIDTH bits are zero
//
// compared to Algorithm 3.9 of [1], we have {x+ln,xn} = X_h
//
// first step: square X in r, result is exact
let mut us_len = xs_rec_len + (ts_len - th);
// We use the same temporary buffer to store r and u: r needs 2*xn limbs where u needs
// xn+(tn-th) limbs. Since tn can store at least 2h bits, and th at most h bits, then tn-th
// can store at least h bits, thus tn - th >= xn, and reserving the space for u is enough.
assert!(xs_rec_len << 1 <= us_len);
let sn = xs_len + rn;
let mut scratch = vec![0; (us_len << 1) + sn];
split_into_chunks_mut!(scratch, us_len, [us, rs], ss);
let mut rs = rs;
if h << 1 <= Limb::WIDTH {
// xn=rn=1, and since p <= 2h-3, n=1, thus ln = 0
assert_eq!(ln, 0);
let cy = out_hi[0] >> const { Limb::WIDTH >> 1 };
rs = &mut rs[1..];
rs[0] = cy.square();
} else if xs_rec_len == 1 {
// xn = 1, rn = 2
(rs[1], rs[0]) = Limb::x_mul_y_to_zz(out_hi[0], out_hi[0]);
} else {
let mut scratch = vec![0; limbs_square_to_out_scratch_len(xs_rec_len)];
limbs_square_to_out(rs, out_hi, &mut scratch);
// we have {r, 2 * xn} = X_h ^ 2
if rn < xs_rec_len << 1 {
rs = &mut rs[1..];
}
}
// now the 2h most significant bits of {r, rn} contains X ^ 2, r has rn limbs, and the low
// (-2h) % Limb::WIDTH bits are zero
//
// Second step: s <- A * (r ^ 2), and truncate the low ap bits, i.e., at weight 2 ^ {-2h} (s
// is aligned to the low significant bits)
if rn == 1 {
// rn=1 implies n=1, since rn*Limb::WIDTH >= 2h, and 2h >= p + 3
//
// necessarily p <= Limb::WIDTH-3: we can ignore the two low bits from A
//
// since n=1, and we ensured an <= n, we also have an=1
assert_eq!(xs_len, 1);
(ss[1], ss[0]) = Limb::x_mul_y_to_zz(rs[0], xs[0]);
} else {
// we have p <= n * Limb::WIDTH 2h <= rn * Limb::WIDTH with p+3 <= 2h <= p+4 thus n <=
// rn <= n + 1
assert!(rn <= out_len + 1);
// since we ensured an <= n, we have an <= rn
assert!(xs_len <= rn);
let mut scratch = vec![0; limbs_mul_greater_to_out_scratch_len(rn, xs_len)];
limbs_mul_greater_to_out(ss, &rs[..rn], xs, &mut scratch);
// s should be near B ^ sn / 2 ^ (1 + a_s), thus s[sn-1] is either
// - 100000... or 011111... if a_s = 0, or
// - 010000... or 001111... if a_s = 1.
// We ignore the bits of s after the first 2h + 1 + a_s ones. We have {s, rn+an} = A *
// X_h ^ 2 / 2 if a_s = 0, A * X_h ^ 2 / 4 if a_s = 1.
}
// We ignore the bits of s after the first 2h + 1 + a_s ones: s has rn + an limbs, where rn
// = LIMBS(2h), an = LIMBS(a), and tn = LIMBS(2h + 1 + a_s).
let ts = &mut ss[sn - ts_len..]; // pointer to low limb of the high part of t
// the upper h-3 bits of 1 - t should be zero, where 1 corresponds to the most significant
// bit of t[tn - 1] if a_s = 0, and to the 2nd most significant bit of t[tn - 1] if a_s = 1
//
// compute t <- 1 - t, which is B^tn - {t, tn + 1}, with rounding toward -Inf, i.e.,
// rounding the input t toward +Inf. We could only modify the low tn - th limbs from t, but
// it gives only a small speedup, and would make the code more complex.
let bit = if parity {
const { LIMB_HIGH_BIT >> 1 }
} else {
LIMB_HIGH_BIT
};
let ts_last = ts.last_mut().unwrap();
let neg = *ts_last & bit;
if neg == 0 {
// Ax ^ 2 < 1: we have t = th + eps, where 0 <= eps < ulp(th) is the part truncated
// above, thus 1 - t rounded to -Inf is 1 - th - ulp(th)
//
// since the 1 + a_s most significant bits of t are zero, set them to 1 before the
// one-complement
*ts_last |= LIMB_HIGH_BIT | bit;
limbs_not_in_place(ts);
// we should add 1 here to get 1-th complement, and subtract 1 for -ulp(th), thus we do
// nothing
} else {
// negative case: we want 1 - t rounded toward -Inf, i.e., th + eps rounded toward +Inf,
// which is th + ulp(th): we discard the bit corresponding to 1, and we add 1 to the
// least significant bit of t
*ts_last ^= neg;
limbs_slice_add_limb_in_place(ts, 1);
}
// we know at least th = floor((h + 1 + a_s - 3) / Limb::WIDTH) of the high limbs of {t, tn}
// are zero
ts_len -= th;
// tn = rn - th, where rn * Limb::WIDTH >= 2 * h and th * Limb::WIDTH <= h + 1 + a_s - 3,
// thus tn > 0
assert_ne!(ts_len, 0);
// u <- x * t, where {t, tn} contains at least h + 3 bits, and {x, xn} contains h bits, thus
// tn >= xn
assert!(ts_len >= xs_rec_len);
if ts_len == 1 {
// necessarily xn = 1
(us[1], us[0]) = Limb::x_mul_y_to_zz(ts[0], out_hi[0]);
} else {
let mut scratch = vec![0; limbs_mul_greater_to_out_scratch_len(ts_len, xs_rec_len)];
limbs_mul_greater_to_out(us, &ts[..ts_len], out_hi, &mut scratch);
}
// we have {u, tn+xn} = T_l X_h / 2 if a_s = 0, T_l X_h / 4 if a_s = 1
//
// we have already discarded the upper th high limbs of t, thus we only have to consider the
// upper n - th limbs of u
us_len = out_len - th;
// un cannot be zero, since p <= n * Limb::WIDTH, h = ceil((p + 3) / 2) <= (p + 4) / 2, th *
// Limb::WIDTH <= h - 1 <= p / 2 + 1, thus (n - th) * Limb::WIDTH >= p / 2 - 1.
assert_ne!(us_len, 0);
let u_offset = ts_len + xs_rec_len - us_len;
// xn + tn - un = xn + (original_tn - th) - (n - th) = xn + original_tn - n = LIMBS(h) +
// LIMBS(2h + 1 + a_s) - LIMBS(p) > 0 since 2h >= p + 3
assert_ne!(u_offset, 0); // will allow to access u[-1] below
// In case a_s = 0, u contains |x * (1 - Ax ^ 2) / 2|, which is exactly what we need to add
// or subtract. In case a_s = 1, u contains |x * (1 - Ax ^ 2) / 4|, thus we need to multiply
// u by 2.
if parity {
// shift on un+1 limbs to get most significant bit of u[-1] into least significant bit
// of u[0]
limbs_slice_shl_in_place(&mut us[u_offset - 1..u_offset + us_len], 1);
}
// now {u,un} represents U / 2 from Algorithm 3.9
let shift_bit = Limb::power_of_2(limb_to_bit_count(out_len) - out_prec);
// We want that the low pl bits are zero after rounding to nearest, thus we round u to
// nearest at bit pl - 1 of u[0]
let (us_head, us_tail) = us[u_offset - 1..].split_first_mut().unwrap();
let us_tail = &mut us_tail[..us_len];
let cu = if shift_bit == 1 {
// round bit is in u[-1]
limbs_slice_add_limb_in_place(us_tail, *us_head >> const { Limb::WIDTH - 1 })
} else {
let uu = us_tail[0];
let cu = limbs_slice_add_limb_in_place(us_tail, uu & (shift_bit >> 1));
// mask bits 0..pl - 1 of u[0]
us_tail[0] &= !(shift_bit - 1);
cu
};
assert!(!cu);
// We already have filled {x + ln, xn = n - ln}, and we want to add or subtract {u, un} at
// position x.
// - un = n - th, where th contains <= h + 1 + a_s - 3 <= h - 1 bits
// - ln = n - xn, where xn contains >= h bits
// - thus un > ln.
// Warning: ln might be zero.
assert!(us_len > ln);
// we can have un = ln + 2, for example with Limb::WIDTH = 32 and p = 62, a_s = 0, then h =
// 33, n = 2, th = 0, xn = 2, thus un = 2 and ln = 0.
assert!(us_len == ln + 1 || us_len == ln + 2);
// the high un-ln limbs of u will overlap the low part of {x+ln,xn}, we need to add or
// subtract the overlapping part {u + ln, un - ln}
//
// Warning! th may be 0, in which case the mpn_add_1 and mpn_sub_1 below (with size = th)
// mustn't be used.
let mut carry;
let (us_lo, us_hi) = us_tail.split_at_mut(ln);
if neg == 0 {
if ln != 0 {
out_lo.copy_from_slice(us_lo);
}
carry = limbs_slice_add_greater_in_place_left(out_hi, us_hi);
// cy is the carry at x + (ln + xn) = x + n
} else {
// negative case
//
// subtract {u+ln, un-ln} from {x+ln,un}
carry = limbs_sub_greater_in_place_left(out_hi, us_hi);
// cy is the borrow at x + (ln + xn) = x + n
//
// cy cannot be non-zero, since the most significant bit of Xh is 1, and the correction
// is bounded by 2^{-h+3}
assert!(!carry);
if ln != 0 {
limbs_not_to_out(out, us_lo);
// we must add one for the 2-complement
carry = limbs_slice_add_limb_in_place(out, 1);
// ... and subtract 1 at x[ln], where n = ln + xn
if limbs_sub_limb_in_place(&mut out[ln..], 1) {
assert!(carry);
carry = false;
}
}
}
// cy can be 1 when A=1, i.e., {a, n} = B ^ n. In that case we should have X = B ^ n, and
// setting X to 1-2^{-p} satisfies the error bound of 1 ulp.
if carry {
assert!(limbs_sub_limb_in_place(out, shift_bit));
}
}
}