1use crate::natural::InnerNatural::{Large, Small};
30use crate::natural::Natural;
31use crate::natural::arithmetic::add::{
32 limbs_add_limb_to_out, limbs_add_same_length_to_out, limbs_add_to_out,
33 limbs_slice_add_greater_in_place_left, limbs_slice_add_limb_in_place,
34 limbs_slice_add_same_length_in_place_left,
35};
36use crate::natural::arithmetic::add_mul::limbs_slice_add_mul_limb_same_length_in_place_left;
37use crate::natural::arithmetic::mul::fft::mpn_square_default_mpn_ctx;
38use crate::natural::arithmetic::mul::limb::limbs_mul_limb_to_out;
39use crate::natural::arithmetic::mul::limbs_mul_greater_to_out_basecase;
40use crate::natural::arithmetic::mul::poly_eval::{
41 limbs_mul_toom_evaluate_deg_3_poly_in_1_and_neg_1,
42 limbs_mul_toom_evaluate_deg_3_poly_in_2_and_neg_2, limbs_mul_toom_evaluate_poly_in_1_and_neg_1,
43 limbs_mul_toom_evaluate_poly_in_2_and_neg_2,
44 limbs_mul_toom_evaluate_poly_in_2_pow_and_neg_2_pow,
45 limbs_mul_toom_evaluate_poly_in_2_pow_neg_and_neg_2_pow_neg,
46};
47use crate::natural::arithmetic::mul::poly_interpolate::{
48 limbs_mul_toom_interpolate_5_points, limbs_mul_toom_interpolate_7_points,
49 limbs_mul_toom_interpolate_12_points, limbs_mul_toom_interpolate_16_points,
50};
51use crate::natural::arithmetic::mul::toom::{
52 BIT_CORRECTION, TUNE_PROGRAM_BUILD, WANT_FAT_BINARY, limbs_toom_couple_handling,
53};
54use crate::natural::arithmetic::shl::{limbs_shl_to_out, limbs_slice_shl_in_place};
55use crate::natural::arithmetic::sub::{
56 limbs_sub_limb_in_place, limbs_sub_same_length_in_place_left, limbs_sub_same_length_to_out,
57};
58use crate::natural::comparison::cmp::limbs_cmp_same_length;
59use crate::platform::{
60 DoubleLimb, Limb, SQR_BASECASE_THRESHOLD, SQR_TOOM2_THRESHOLD, SQR_TOOM3_THRESHOLD,
61 SQR_TOOM4_THRESHOLD, SQR_TOOM6_THRESHOLD, SQR_TOOM8_THRESHOLD,
62};
63use alloc::vec::Vec;
64use core::cmp::{Ordering::*, max};
65use malachite_base::fail_on_untested_path;
66use malachite_base::num::arithmetic::traits::{
67 ArithmeticCheckedShl, DivRound, ShrRound, Square, SquareAssign, WrappingAddAssign,
68 WrappingSubAssign, XMulYToZZ,
69};
70use malachite_base::num::basic::integers::PrimitiveInt;
71use malachite_base::num::basic::traits::{One, Zero};
72use malachite_base::num::conversion::traits::{SplitInHalf, WrappingFrom};
73use malachite_base::rounding_modes::RoundingMode::*;
74
75const SQR_FFT_MODF_THRESHOLD: usize = SQR_TOOM3_THRESHOLD * 3;
76
77#[inline]
86pub(crate) fn limbs_square_diagonal(out: &mut [Limb], xs: &[Limb]) {
87 for (i, &x) in xs.iter().enumerate() {
88 let i_2 = i << 1;
89 (out[i_2 | 1], out[i_2]) = DoubleLimb::from(x).square().split_in_half();
90 }
91}
92
93pub_test! {limbs_square_diagonal_add_shl_1(out: &mut [Limb], scratch: &mut [Limb], xs: &[Limb]) {
104 limbs_square_diagonal(out, xs);
105 let (out_last, out_init) = out.split_last_mut().unwrap();
106 *out_last += limbs_slice_shl_in_place(scratch, 1);
107 if limbs_slice_add_same_length_in_place_left(&mut out_init[1..], scratch) {
108 *out_last += 1;
109 }
110}}
111
112pub_crate_test! {limbs_square_to_out_basecase(out: &mut [Limb], xs: &[Limb]) {
130 let n = xs.len();
131 let (xs_head, xs_tail) = xs.split_first().unwrap();
132 (out[1], out[0]) = DoubleLimb::from(*xs_head).square().split_in_half();
133 if n > 1 {
134 assert!(n <= SQR_TOOM2_THRESHOLD);
135 let scratch = &mut [0; SQR_TOOM2_THRESHOLD << 1];
136 let two_n = n << 1;
137 let scratch = &mut scratch[..two_n - 2];
138 let (scratch_last, scratch_init) = scratch[..n].split_last_mut().unwrap();
139 *scratch_last = limbs_mul_limb_to_out::<DoubleLimb, Limb>(scratch_init, xs_tail, *xs_head);
140 for i in 1..n - 1 {
141 let (scratch_last, scratch_init) = scratch[i..][i..n].split_last_mut().unwrap();
142 let (xs_head, xs_tail) = xs[i..].split_first().unwrap();
143 *scratch_last =
144 limbs_slice_add_mul_limb_same_length_in_place_left(scratch_init, xs_tail, *xs_head);
145 }
146 limbs_square_diagonal_add_shl_1(&mut out[..two_n], scratch, xs);
147 }
148}}
149
150pub_const_test! {limbs_square_to_out_toom_2_scratch_len(xs_len: usize) -> usize {
155 (xs_len + Limb::WIDTH as usize) << 1
156}}
157
158const TOOM2_MAYBE_SQR_TOOM2: bool =
162 TUNE_PROGRAM_BUILD || WANT_FAT_BINARY || SQR_TOOM3_THRESHOLD >= 2 * SQR_TOOM2_THRESHOLD;
163
164fn limbs_square_to_out_toom_2_recursive(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
173 if !TOOM2_MAYBE_SQR_TOOM2 || xs.len() < SQR_TOOM2_THRESHOLD {
174 limbs_square_to_out_basecase(out, xs);
175 } else {
176 limbs_square_to_out_toom_2(out, xs, scratch);
177 }
178}
179
180pub_test! {limbs_square_to_out_toom_2(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
213 let xs_len = xs.len();
214 assert!(xs_len > 1);
215 let out = &mut out[..xs_len << 1];
216 let s = xs_len >> 1;
217 let n = xs_len - s;
218 let (xs_0, xs_1) = xs.split_at(n);
219 if s == n {
220 if limbs_cmp_same_length(xs_0, xs_1) == Less {
221 limbs_sub_same_length_to_out(out, xs_1, xs_0);
222 } else {
223 limbs_sub_same_length_to_out(out, xs_0, xs_1);
224 }
225 } else {
226 let (xs_0_last, xs_0_init) = xs_0.split_last().unwrap();
228 let (out_last, out_init) = out[..n].split_last_mut().unwrap();
229 if *xs_0_last == 0 && limbs_cmp_same_length(xs_0_init, xs_1) == Less {
230 limbs_sub_same_length_to_out(out_init, xs_1, xs_0_init);
231 *out_last = 0;
232 } else {
233 *out_last = *xs_0_last;
234 if limbs_sub_same_length_to_out(out_init, xs_0_init, xs_1) {
235 out_last.wrapping_sub_assign(1);
236 }
237 }
238 }
239 let (v_0, v_inf) = out.split_at_mut(n << 1);
240 let (v_neg_1, scratch_out) = scratch.split_at_mut(n << 1);
241 limbs_square_to_out_toom_2_recursive(v_neg_1, &v_0[..n], scratch_out);
242 limbs_square_to_out_toom_2_recursive(v_inf, xs_1, scratch_out);
243 limbs_square_to_out_toom_2_recursive(v_0, xs_0, scratch_out);
244 let (v_0_lo, v_0_hi) = v_0.split_at_mut(n);
245 let (v_inf_lo, v_inf_hi) = v_inf.split_at_mut(n);
246 let mut carry = Limb::from(limbs_slice_add_same_length_in_place_left(v_inf_lo, v_0_hi));
247 let mut carry2 = carry;
248 if limbs_add_same_length_to_out(v_0_hi, v_inf_lo, v_0_lo) {
249 carry2 += 1;
250 }
251 if limbs_slice_add_greater_in_place_left(v_inf_lo, &v_inf_hi[..s + s - n]) {
252 carry += 1;
253 }
254 if limbs_sub_same_length_in_place_left(&mut out[n..3 * n], v_neg_1) {
255 carry.wrapping_sub_assign(1);
256 }
257 assert!(carry.wrapping_add(1) <= 3);
258 assert!(carry2 <= 2);
259 let carry3 = limbs_slice_add_limb_in_place(&mut out[n << 1..], carry2);
260 let out_hi = &mut out[3 * n..];
261 if carry <= 2 {
262 assert!(!limbs_slice_add_limb_in_place(out_hi, carry));
263 } else if limbs_sub_limb_in_place(out_hi, 1) {
264 assert!(carry3);
265 }
266}}
267
268pub_const_test! {limbs_square_to_out_toom_3_input_size_valid(xs_len: usize) -> bool {
274 xs_len == 3 || xs_len > 4
275}}
276
277pub_const_test! {limbs_square_to_out_toom_3_scratch_len(xs_len: usize) -> usize {
282 3 * xs_len + Limb::WIDTH as usize
283}}
284
285const SMALLER_RECURSION_TOOM_3: bool = true;
288
289#[cfg(feature = "test_build")]
293pub const TOOM3_MAYBE_SQR_TOOM3: bool =
294 TUNE_PROGRAM_BUILD || WANT_FAT_BINARY || SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD;
295#[cfg(not(feature = "test_build"))]
296const TOOM3_MAYBE_SQR_TOOM3: bool =
297 TUNE_PROGRAM_BUILD || WANT_FAT_BINARY || SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD;
298
299#[cfg(feature = "test_build")]
303pub const TOOM3_MAYBE_SQR_BASECASE: bool =
304 TUNE_PROGRAM_BUILD || WANT_FAT_BINARY || SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD;
305#[cfg(not(feature = "test_build"))]
306const TOOM3_MAYBE_SQR_BASECASE: bool =
307 TUNE_PROGRAM_BUILD || WANT_FAT_BINARY || SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD;
308
309fn limbs_square_to_out_toom_3_recursive(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
318 let n = xs.len();
319 if TOOM3_MAYBE_SQR_BASECASE && n < SQR_TOOM2_THRESHOLD {
320 limbs_square_to_out_basecase(out, xs);
321 } else if !TOOM3_MAYBE_SQR_TOOM3 || n < SQR_TOOM3_THRESHOLD {
322 limbs_square_to_out_toom_2(out, xs, scratch);
323 } else {
324 limbs_square_to_out_toom_3(out, xs, scratch);
325 }
326}
327
328pub_test! {limbs_square_to_out_toom_3(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
363 let xs_len = xs.len();
364 let n = xs_len.div_round(3, Ceiling).0;
365 let m = n + 1;
366 let k = m + n;
367 let s = xs_len - (n << 1);
368 assert_ne!(s, 0);
369 assert!(s <= n);
370 split_into_chunks!(xs, n, [xs_0, xs_1], xs_2);
371 split_into_chunks_mut!(scratch, m << 1, [scratch_lo, asm1], as1);
372 let (asm1_last, asm1_init) = asm1[..m].split_last_mut().unwrap();
373 let (as1_last, as1_init) = as1[..m].split_last_mut().unwrap();
374 let scratch_lo = &mut scratch_lo[..n];
375 let mut carry = Limb::from(limbs_add_to_out(scratch_lo, xs_0, xs_2));
376 *as1_last = carry;
377 if limbs_add_same_length_to_out(as1_init, scratch_lo, xs_1) {
378 *as1_last += 1;
379 }
380 if carry == 0 && limbs_cmp_same_length(scratch_lo, xs_1) == Less {
381 limbs_sub_same_length_to_out(asm1_init, xs_1, scratch_lo);
382 } else if limbs_sub_same_length_to_out(asm1_init, scratch_lo, xs_1) {
383 carry.wrapping_sub_assign(1);
384 }
385 *asm1_last = carry;
386 let as2 = &mut out[m..m << 1];
387 let (as2_last, as2_init) = as2.split_last_mut().unwrap();
388 let (as1_lo, as1_hi) = as1_init.split_at_mut(s);
389 let mut carry = Limb::from(limbs_add_same_length_to_out(as2_init, xs_2, as1_lo));
390 if s != n {
391 carry = Limb::from(limbs_add_limb_to_out(&mut as2_init[s..], as1_hi, carry));
392 }
393 carry.wrapping_add_assign(*as1_last);
394 carry = carry.arithmetic_checked_shl(1).unwrap();
395 carry.wrapping_add_assign(limbs_slice_shl_in_place(as2_init, 1));
396 if limbs_sub_same_length_in_place_left(as2_init, xs_0) {
397 carry.wrapping_sub_assign(1);
398 }
399 *as2_last = carry;
400 assert!(*as1_last <= 2);
401 assert!(*asm1_last <= 1);
402 let (scratch_lo, scratch_out) = scratch.split_at_mut(5 * m);
403 if SMALLER_RECURSION_TOOM_3 {
404 let (v_neg_1, asm1) = scratch_lo.split_at_mut(k);
405 let (v_neg_1_last, v_neg_1_init) = v_neg_1.split_last_mut().unwrap();
406 let (asm1_last, asm1_init) = asm1[1..n + 2].split_last().unwrap();
407 limbs_square_to_out_toom_3_recursive(v_neg_1_init, asm1_init, scratch_out);
408 *v_neg_1_last = if *asm1_last != 0 {
409 asm1_last.wrapping_add(limbs_slice_add_mul_limb_same_length_in_place_left(
410 &mut v_neg_1_init[n..],
411 asm1_init,
412 2,
413 ))
414 } else {
415 0
416 };
417 } else {
418 fail_on_untested_path("limbs_square_to_out_toom_3, !SMALLER_RECURSION");
419 let (v_neg_1, asm1) = scratch_lo.split_at_mut(m << 1);
420 limbs_square_to_out_toom_3_recursive(v_neg_1, &asm1[..m], scratch_out);
421 }
422 limbs_square_to_out_toom_3_recursive(&mut scratch_lo[k..], as2, scratch_out);
423 let v_inf = &mut out[n << 2..];
424 limbs_square_to_out_toom_3_recursive(v_inf, xs_2, scratch_out);
425 let v_inf_0 = v_inf[0];
426 let (as1, scratch_out) = &mut scratch[m << 2..].split_at_mut(m);
427 let out_hi = &mut out[n << 1..];
428 if SMALLER_RECURSION_TOOM_3 {
429 let (v_1_last, v_1_init) = out_hi[..k].split_last_mut().unwrap();
430 let (as1_last, as1_init) = as1.split_last_mut().unwrap();
431 limbs_square_to_out_toom_3_recursive(v_1_init, as1_init, scratch_out);
432 let v_1_init = &mut v_1_init[n..];
433 *v_1_last = if *as1_last == 1 {
434 limbs_slice_add_mul_limb_same_length_in_place_left(v_1_init, as1_init, 2)
435 .wrapping_add(1)
436 } else if *as1_last != 0 {
437 as1_last.arithmetic_checked_shl(1u64).unwrap().wrapping_add(
438 limbs_slice_add_mul_limb_same_length_in_place_left(v_1_init, as1_init, 4),
439 )
440 } else {
441 0
442 };
443 } else {
444 let carry = out_hi[k];
445 limbs_square_to_out_toom_3_recursive(out_hi, as1, scratch_out);
446 out_hi[k] = carry;
447 }
448 let (v_neg_1, remainder) = scratch.split_at_mut(k);
449 let (v_2, scratch_out) = remainder.split_at_mut(3 * n + 4);
450 limbs_square_to_out_toom_3_recursive(out, xs_0, scratch_out);
451 limbs_mul_toom_interpolate_5_points(out, v_2, v_neg_1, n, s << 1, false, v_inf_0);
452}}
453
454pub_const_test! {limbs_square_to_out_toom_4_input_size_valid(xs_len: usize) -> bool {
460 xs_len == 4 || xs_len == 7 || xs_len == 8 || xs_len > 9
461}}
462
463pub_const_test! {limbs_square_to_out_toom_4_scratch_len(xs_len: usize) -> usize {
468 3 * xs_len + Limb::WIDTH as usize
469}}
470
471const TOOM4_MAYBE_SQR_TOOM2: bool =
475 TUNE_PROGRAM_BUILD || SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM3_THRESHOLD;
476
477const TOOM4_MAYBE_SQR_TOOM4: bool =
481 TUNE_PROGRAM_BUILD || SQR_TOOM6_THRESHOLD >= 4 * SQR_TOOM4_THRESHOLD;
482
483fn limbs_square_to_out_toom_4_recursive(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
492 let n = xs.len();
493 if n < SQR_TOOM2_THRESHOLD {
494 limbs_square_to_out_basecase(out, xs);
497 } else if TOOM4_MAYBE_SQR_TOOM2 && n < SQR_TOOM3_THRESHOLD {
498 limbs_square_to_out_toom_2(out, xs, scratch);
499 } else if !TOOM4_MAYBE_SQR_TOOM4 || n < SQR_TOOM4_THRESHOLD {
500 limbs_square_to_out_toom_3(out, xs, scratch);
501 } else {
502 limbs_square_to_out_toom_4(out, xs, scratch);
503 }
504}
505
506pub_test! {limbs_square_to_out_toom_4(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
543 let xs_len = xs.len();
544 let n = (xs_len + 3) >> 2;
545 let s = xs_len - 3 * n;
546 assert_ne!(s, 0);
547 assert!(s <= n);
548 let m = n + 1;
549 let k = m + n;
550 split_into_chunks!(xs, n, [xs_0, xs_1, xs_2], xs_3);
551 let (apx, remainder) = out.split_at_mut(n << 1);
555 let apx = &mut apx[..m];
556 let (v1, amx) = remainder.split_at_mut(m << 1);
557 let amx = &mut amx[..m];
558 let (scratch_lo, scratch_hi) = scratch.split_at_mut((k << 2) + 1);
559 limbs_mul_toom_evaluate_deg_3_poly_in_2_and_neg_2(apx, amx, xs, n, &mut scratch_hi[..m]);
560 limbs_square_to_out_toom_4_recursive(scratch_lo, apx, scratch_hi);
561 let scratch_lo = &mut scratch_lo[k..];
562 limbs_square_to_out_toom_4_recursive(scratch_lo, amx, scratch_hi);
563 let (apx_last, apx_init) = apx.split_last_mut().unwrap();
565 let mut carry = limbs_shl_to_out(apx_init, xs_0, 1);
566 if limbs_slice_add_same_length_in_place_left(apx_init, xs_1) {
567 carry.wrapping_add_assign(1);
568 }
569 carry = carry.arithmetic_checked_shl(1).unwrap();
570 carry.wrapping_add_assign(limbs_slice_shl_in_place(apx_init, 1));
571 if limbs_slice_add_same_length_in_place_left(apx_init, xs_2) {
572 carry.wrapping_add_assign(1);
573 }
574 carry = carry.arithmetic_checked_shl(1).unwrap();
575 carry.wrapping_add_assign(limbs_slice_shl_in_place(apx_init, 1));
576 if limbs_slice_add_greater_in_place_left(apx_init, xs_3) {
577 carry.wrapping_add_assign(1);
578 }
579 *apx_last = carry;
580 assert!(*apx_last < 15);
581 let scratch_lo = &mut scratch_lo[k..];
582 limbs_square_to_out_toom_4_recursive(scratch_lo, apx, scratch_hi);
583 limbs_mul_toom_evaluate_deg_3_poly_in_1_and_neg_1(apx, amx, xs, n, &mut scratch_hi[..m]);
585 limbs_square_to_out_toom_4_recursive(v1, apx, scratch_hi);
586 let scratch_lo = &mut scratch_lo[k..];
587 limbs_square_to_out_toom_4_recursive(scratch_lo, amx, scratch_hi);
588 let (v0, vinf) = out.split_at_mut(n << 1);
589 let vinf = &mut vinf[n << 2..];
590 limbs_square_to_out_toom_4_recursive(v0, xs_0, scratch_hi);
591 limbs_square_to_out_toom_4_recursive(vinf, xs_3, scratch_hi);
592 split_into_chunks_mut!(scratch, k, [v2, vm2, vh, vm1], scratch_hi);
593 limbs_mul_toom_interpolate_7_points(out, n, s << 1, false, vm2, false, vm1, v2, vh, scratch_hi);
594}}
595
596pub_const_test! {limbs_square_to_out_toom_6_input_size_valid(xs_len: usize) -> bool {
602 xs_len == 18 || xs_len > 21 && xs_len != 25 && xs_len != 26 && xs_len != 31
603}}
604
605pub_crate_test! {limbs_square_to_out_toom_6_scratch_len(n: usize) -> usize {
610 (n << 1)
611 + max(
612 (SQR_TOOM6_THRESHOLD << 1) + usize::wrapping_from(Limb::WIDTH) * 6,
613 limbs_square_to_out_toom_4_scratch_len(SQR_TOOM6_THRESHOLD),
614 )
615 - (SQR_TOOM6_THRESHOLD << 1)
616}}
617
618const SQR_TOOM6_MAX: usize = (SQR_TOOM8_THRESHOLD + 6 * 2 - 1).div_ceil(6);
622
623const TOOM6_MAYBE_SQR_BASECASE: bool =
627 TUNE_PROGRAM_BUILD || SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM2_THRESHOLD;
628
629const TOOM6_MAYBE_SQR_TOOM2: bool =
633 TUNE_PROGRAM_BUILD || SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM3_THRESHOLD;
634
635const TOOM6_MAYBE_SQR_ABOVE_TOOM2: bool =
639 TUNE_PROGRAM_BUILD || SQR_TOOM6_MAX >= SQR_TOOM3_THRESHOLD;
640
641const TOOM6_MAYBE_SQR_TOOM3: bool =
645 TUNE_PROGRAM_BUILD || SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM4_THRESHOLD;
646
647const TOOM6_MAYBE_SQR_ABOVE_TOOM3: bool =
651 TUNE_PROGRAM_BUILD || SQR_TOOM6_MAX >= SQR_TOOM4_THRESHOLD;
652
653const TOOM6_MAYBE_SQR_ABOVE_TOOM4: bool =
657 TUNE_PROGRAM_BUILD || SQR_TOOM6_MAX >= SQR_TOOM6_THRESHOLD;
658
659fn limbs_square_to_out_toom_6_recursive(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
668 let n = xs.len();
669 if TOOM6_MAYBE_SQR_BASECASE && n < SQR_TOOM2_THRESHOLD {
670 limbs_square_to_out_basecase(out, xs);
671 } else if TOOM6_MAYBE_SQR_TOOM2 && (!TOOM6_MAYBE_SQR_ABOVE_TOOM2 || n < SQR_TOOM3_THRESHOLD) {
672 limbs_square_to_out_toom_2(out, xs, scratch);
673 } else if TOOM6_MAYBE_SQR_TOOM3 && (!TOOM6_MAYBE_SQR_ABOVE_TOOM3 || n < SQR_TOOM4_THRESHOLD) {
674 limbs_square_to_out_toom_3(out, xs, scratch);
675 } else if !TOOM6_MAYBE_SQR_ABOVE_TOOM4 || n < SQR_TOOM6_THRESHOLD {
676 limbs_square_to_out_toom_4(out, xs, scratch);
677 } else {
678 limbs_square_to_out_toom_6(out, xs, scratch);
679 }
680}
681
682pub_test! {limbs_square_to_out_toom_6(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
704 let xs_len = xs.len();
705 assert!(xs_len >= 18);
706 let n = 1 + (xs_len - 1) / 6;
707 assert!(xs_len > 5 * n);
708 let s = xs_len - 5 * n;
709 assert!(s <= n);
710 assert!(10 * n + 3 <= xs_len << 1);
711 let m = n + 1;
712 let k = m + n;
713 let (out_lo, remainder) = out.split_at_mut(3 * n);
714 let (r4, r2) = remainder.split_at_mut(n << 2);
715 let (v0, v2) = r2.split_at_mut(m << 1);
716 let v0 = &mut v0[..m];
717 let v2 = &mut v2[..m];
718 limbs_mul_toom_evaluate_poly_in_2_pow_neg_and_neg_2_pow_neg(
720 v2,
721 v0,
722 5,
723 xs,
724 n,
725 1,
726 &mut out_lo[..m],
727 );
728 split_into_chunks_mut!(scratch, 3 * n + 1, [r5, r3, r1], wse);
729 limbs_square_to_out_toom_6_recursive(out_lo, v0, wse); limbs_square_to_out_toom_6_recursive(r5, v2, wse); limbs_toom_couple_handling(r5, &mut out_lo[..k], false, n, 1, 0);
732 limbs_mul_toom_evaluate_poly_in_1_and_neg_1(v2, v0, 5, xs, n, &mut out_lo[..m]);
734 limbs_square_to_out_toom_6_recursive(out_lo, v0, wse); limbs_square_to_out_toom_6_recursive(r3, v2, wse); limbs_toom_couple_handling(r3, &mut out_lo[..k], false, n, 0, 0);
737 limbs_mul_toom_evaluate_poly_in_2_pow_and_neg_2_pow(v2, v0, 5, xs, n, 2, &mut out_lo[..m]);
739 limbs_square_to_out_toom_6_recursive(out_lo, v0, wse); limbs_square_to_out_toom_6_recursive(r1, v2, wse); limbs_toom_couple_handling(r1, &mut out_lo[..k], false, n, 2, 4);
742 limbs_mul_toom_evaluate_poly_in_2_pow_neg_and_neg_2_pow_neg(
744 v2,
745 v0,
746 5,
747 xs,
748 n,
749 2,
750 &mut out_lo[..m],
751 );
752 limbs_square_to_out_toom_6_recursive(out_lo, v0, wse); limbs_square_to_out_toom_6_recursive(r4, v2, wse); limbs_toom_couple_handling(r4, &mut out_lo[..k], false, n, 2, 0);
755 limbs_mul_toom_evaluate_poly_in_2_and_neg_2(v2, v0, 5, xs, n, &mut out_lo[..m]);
757 limbs_square_to_out_toom_6_recursive(out_lo, v0, wse); let (v0, v2) = r2.split_at_mut(m << 1);
759 limbs_square_to_out_toom_6_recursive(v0, &v2[..m], wse); limbs_toom_couple_handling(r2, &mut out_lo[..k], false, n, 1, 2);
761 limbs_square_to_out_toom_6_recursive(out_lo, &xs[..n], wse); limbs_mul_toom_interpolate_12_points(out, r1, r3, r5, n, s << 1, false, wse);
763}}
764
765pub(crate) const SQR_FFT_THRESHOLD: usize = SQR_FFT_MODF_THRESHOLD * 10;
767
768pub_const_test! {limbs_square_to_out_toom_8_input_size_valid(xs_len: usize) -> bool {
774 xs_len == 40 || xs_len > 43 && xs_len != 49 && xs_len != 50 && xs_len != 57
775}}
776
777pub_crate_test! {limbs_square_to_out_toom_8_scratch_len(n: usize) -> usize {
782 ((n * 15) >> 3)
783 + max(
784 ((SQR_TOOM8_THRESHOLD * 15) >> 3) + usize::wrapping_from(Limb::WIDTH) * 6,
785 limbs_square_to_out_toom_6_scratch_len(SQR_TOOM8_THRESHOLD),
786 )
787 - ((SQR_TOOM8_THRESHOLD * 15) >> 3)
788}}
789
790const SQR_TOOM8_MAX: usize = if SQR_FFT_THRESHOLD <= usize::MAX - (8 * 2 - 1 + 7) {
794 (SQR_FFT_THRESHOLD + 8 * 2 - 1).div_ceil(8)
795} else {
796 usize::MAX
797};
798
799const TOOM8_MAYBE_SQR_BASECASE: bool =
803 TUNE_PROGRAM_BUILD || SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM2_THRESHOLD;
804
805const TOOM8_MAYBE_SQR_ABOVE_BASECASE: bool =
809 TUNE_PROGRAM_BUILD || SQR_TOOM8_MAX >= SQR_TOOM2_THRESHOLD;
810
811const TOOM8_MAYBE_SQR_TOOM2: bool =
815 TUNE_PROGRAM_BUILD || SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM3_THRESHOLD;
816
817const TOOM8_MAYBE_SQR_ABOVE_TOOM2: bool =
821 TUNE_PROGRAM_BUILD || SQR_TOOM8_MAX >= SQR_TOOM3_THRESHOLD;
822
823const TOOM8_MAYBE_SQR_TOOM3: bool =
827 TUNE_PROGRAM_BUILD || SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM4_THRESHOLD;
828
829const TOOM8_MAYBE_SQR_ABOVE_TOOM3: bool =
833 TUNE_PROGRAM_BUILD || SQR_TOOM8_MAX >= SQR_TOOM4_THRESHOLD;
834
835const TOOM8_MAYBE_SQR_TOOM4: bool =
839 TUNE_PROGRAM_BUILD || SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM6_THRESHOLD;
840
841const TOOM8_MAYBE_SQR_ABOVE_TOOM4: bool =
845 TUNE_PROGRAM_BUILD || SQR_TOOM8_MAX >= SQR_TOOM6_THRESHOLD;
846
847const TOOM8_MAYBE_SQR_ABOVE_TOOM6: bool =
851 TUNE_PROGRAM_BUILD || SQR_TOOM8_MAX >= SQR_TOOM8_THRESHOLD;
852
853fn limbs_square_to_out_toom_8_recursive(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
863 let n = xs.len();
864 if TOOM8_MAYBE_SQR_BASECASE && (!TOOM8_MAYBE_SQR_ABOVE_BASECASE || n < SQR_TOOM2_THRESHOLD) {
865 limbs_square_to_out_basecase(out, xs);
866 } else if TOOM8_MAYBE_SQR_TOOM2 && (!TOOM8_MAYBE_SQR_ABOVE_TOOM2 || n < SQR_TOOM3_THRESHOLD) {
867 limbs_square_to_out_toom_2(out, xs, scratch);
868 } else if TOOM8_MAYBE_SQR_TOOM3 && (!TOOM8_MAYBE_SQR_ABOVE_TOOM3 || n < SQR_TOOM4_THRESHOLD) {
869 limbs_square_to_out_toom_3(out, xs, scratch);
870 } else if TOOM8_MAYBE_SQR_TOOM4 && (!TOOM8_MAYBE_SQR_ABOVE_TOOM4 || n < SQR_TOOM6_THRESHOLD) {
871 limbs_square_to_out_toom_4(out, xs, scratch);
872 } else if !TOOM8_MAYBE_SQR_ABOVE_TOOM6 || n < SQR_TOOM8_THRESHOLD {
873 limbs_square_to_out_toom_6(out, xs, scratch);
874 } else {
875 limbs_square_to_out_toom_8(out, xs, scratch);
876 }
877}
878
879pub_test! {limbs_square_to_out_toom_8(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
901 let xs_len = xs.len();
902 assert!(xs_len >= 40);
903 let n: usize = xs_len.shr_round(3, Ceiling).0;
904 let m = n + 1;
905 let k = m + n;
906 let p = k + n;
907 assert!(xs_len > 7 * n);
908 let s = xs_len - 7 * n;
909 assert!(s <= n);
910 assert!(s << 1 > 3);
911 let (pp_lo, remainder) = out.split_at_mut(3 * n);
912 split_into_chunks_mut!(remainder, n << 2, [r6, r4], pp_hi);
913 split_into_chunks_mut!(pp_hi, m, [v0, _unused, v2], _unused);
914 limbs_mul_toom_evaluate_poly_in_2_pow_neg_and_neg_2_pow_neg(
916 v2,
917 v0,
918 7,
919 xs,
920 n,
921 3,
922 &mut pp_lo[..m],
923 );
924 let (r7_r5, remainder) = scratch.split_at_mut(p << 1);
925 let (r3, r1_wse) = remainder.split_at_mut(p);
926 let (r1, wse) = r1_wse.split_at_mut(p);
927 limbs_square_to_out_toom_8_recursive(pp_lo, v0, wse);
929 limbs_square_to_out_toom_8_recursive(r7_r5, v2, wse);
930 let limit = if BIT_CORRECTION { m << 1 } else { k };
931 limbs_toom_couple_handling(r7_r5, &mut pp_lo[..limit], false, n, 3, 0);
932 limbs_mul_toom_evaluate_poly_in_2_pow_neg_and_neg_2_pow_neg(
934 v2,
935 v0,
936 7,
937 xs,
938 n,
939 2,
940 &mut pp_lo[..m],
941 );
942 limbs_square_to_out_toom_8_recursive(pp_lo, v0, wse);
944 let (r7, r5) = r7_r5.split_at_mut(p);
945 limbs_square_to_out_toom_8_recursive(r5, v2, wse);
946 limbs_toom_couple_handling(r5, &mut pp_lo[..k], false, n, 2, 0);
947 limbs_mul_toom_evaluate_poly_in_2_and_neg_2(v2, v0, 7, xs, n, &mut pp_lo[..m]);
949 limbs_square_to_out_toom_8_recursive(pp_lo, v0, wse);
951 limbs_square_to_out_toom_8_recursive(r3, v2, wse);
952 limbs_toom_couple_handling(r3, &mut pp_lo[..k], false, n, 1, 2);
953 limbs_mul_toom_evaluate_poly_in_2_pow_and_neg_2_pow(v2, v0, 7, xs, n, 3, &mut pp_lo[..m]);
955 limbs_square_to_out_toom_8_recursive(pp_lo, v0, wse);
957 limbs_square_to_out_toom_8_recursive(r1, v2, wse);
958 limbs_toom_couple_handling(r1_wse, &mut pp_lo[..limit], false, n, 3, 6);
959 limbs_mul_toom_evaluate_poly_in_2_pow_neg_and_neg_2_pow_neg(
961 v2,
962 v0,
963 7,
964 xs,
965 n,
966 1,
967 &mut pp_lo[..m],
968 );
969 let (r1, wse) = r1_wse.split_at_mut(p);
971 limbs_square_to_out_toom_8_recursive(pp_lo, v0, wse);
972 limbs_square_to_out_toom_8_recursive(r6, v2, wse);
973 limbs_toom_couple_handling(r6, &mut pp_lo[..k], false, n, 1, 0);
974 limbs_mul_toom_evaluate_poly_in_1_and_neg_1(v2, v0, 7, xs, n, &mut pp_lo[..m]);
976 limbs_square_to_out_toom_8_recursive(pp_lo, v0, wse);
978 limbs_square_to_out_toom_8_recursive(r4, v2, wse);
979 limbs_toom_couple_handling(r4, &mut pp_lo[..k], false, n, 0, 0);
980 limbs_mul_toom_evaluate_poly_in_2_pow_and_neg_2_pow(v2, v0, 7, xs, n, 2, &mut pp_lo[..m]);
982 limbs_square_to_out_toom_8_recursive(pp_lo, v0, wse);
984 let (r2, v2) = pp_hi.split_at_mut(m << 1);
985 limbs_square_to_out_toom_8_recursive(r2, &v2[..m], wse);
986 limbs_toom_couple_handling(pp_hi, &mut pp_lo[..k], false, n, 2, 4);
987 limbs_square_to_out_toom_8_recursive(pp_lo, &xs[..n], wse);
989 limbs_mul_toom_interpolate_16_points(out, r1, r3, r5, r7, n, s << 1, false, &mut wse[..p]);
990}}
991
992pub_crate_test! {
993#[allow(clippy::absurd_extreme_comparisons)]
994limbs_square_to_out_scratch_len(n: usize) -> usize {
995 if n < SQR_BASECASE_THRESHOLD || n < SQR_TOOM2_THRESHOLD {
996 0
997 } else if n < SQR_TOOM3_THRESHOLD {
998 limbs_square_to_out_toom_2_scratch_len(n)
999 } else if n < SQR_TOOM4_THRESHOLD {
1000 limbs_square_to_out_toom_3_scratch_len(n)
1001 } else if n < SQR_TOOM6_THRESHOLD {
1002 limbs_square_to_out_toom_4_scratch_len(n)
1003 } else if n < SQR_TOOM8_THRESHOLD {
1004 limbs_square_to_out_toom_6_scratch_len(n)
1005 } else if n < SQR_FFT_THRESHOLD {
1006 limbs_square_to_out_toom_8_scratch_len(n)
1007 } else {
1008 0
1009 }
1010}}
1011
1012pub_crate_test! {
1021#[allow(clippy::absurd_extreme_comparisons)]
1022limbs_square_to_out(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
1023 let n = xs.len();
1024 assert!(n >= 1);
1025 if n < SQR_BASECASE_THRESHOLD {
1026 limbs_mul_greater_to_out_basecase(out, xs, xs);
1029 } else if n < SQR_TOOM2_THRESHOLD {
1030 limbs_square_to_out_basecase(out, xs);
1031 } else if n < SQR_TOOM3_THRESHOLD {
1032 limbs_square_to_out_toom_2(out, xs, scratch);
1033 } else if n < SQR_TOOM4_THRESHOLD {
1034 limbs_square_to_out_toom_3(out, xs, scratch);
1035 } else if n < SQR_TOOM6_THRESHOLD {
1036 limbs_square_to_out_toom_4(out, xs, scratch);
1037 } else if n < SQR_TOOM8_THRESHOLD {
1038 limbs_square_to_out_toom_6(out, xs, scratch);
1039 } else if n < SQR_FFT_THRESHOLD {
1040 limbs_square_to_out_toom_8(out, xs, scratch);
1041 } else {
1042 mpn_square_default_mpn_ctx(out, xs);
1043 }
1044}}
1045
1046pub_crate_test! {limbs_square(xs: &[Limb]) -> Vec<Limb> {
1047 let mut out = vec![0; xs.len() << 1];
1048 let mut square_scratch = vec![0; limbs_square_to_out_scratch_len(xs.len())];
1049 limbs_square_to_out(&mut out, xs, &mut square_scratch);
1050 out
1051}}
1052
1053impl Square for Natural {
1054 type Output = Self;
1055
1056 #[inline]
1079 fn square(mut self) -> Self {
1080 self.square_assign();
1081 self
1082 }
1083}
1084
1085impl Square for &Natural {
1086 type Output = Natural;
1087
1088 #[inline]
1111 fn square(self) -> Natural {
1112 match self {
1113 &Natural::ZERO | &Natural::ONE => self.clone(),
1114 Natural(Small(x)) => Natural({
1115 let (upper, lower) = Limb::x_mul_y_to_zz(*x, *x);
1116 if upper == 0 {
1117 Small(lower)
1118 } else {
1119 Large(vec![lower, upper])
1120 }
1121 }),
1122 Natural(Large(xs)) => Natural::from_owned_limbs_asc(limbs_square(xs)),
1123 }
1124 }
1125}
1126
1127impl SquareAssign for Natural {
1128 fn square_assign(&mut self) {
1156 match self {
1157 &mut (Self::ZERO | Self::ONE) => {}
1158 Self(Small(x)) => {
1159 let (upper, lower) = Limb::x_mul_y_to_zz(*x, *x);
1160 if upper == 0 {
1161 *x = lower;
1162 } else {
1163 *self = Self(Large(vec![lower, upper]));
1164 }
1165 }
1166 Self(Large(xs)) => {
1167 *xs = limbs_square(xs);
1168 self.trim();
1169 }
1170 }
1171 }
1172}