Skip to main content

malachite_nz/natural/arithmetic/
square.rs

1// Copyright © 2026 Mikhail Hogrefe
2//
3// Uses code adopted from the GNU MP Library.
4//
5//      `MAYBE_sqr_toom2`, `TOOM2_SQR_REC`, and `mpn_toom2_sqr` contributed to the GNU project by
6//      Torbjörn Granlund.
7//
8//      `SQR_TOOM6_MAX`, `MAYBE_sqr_basecase`, `MAYBE_sqr_toom2`, `MAYBE_sqr_above_toom2`,
9//      `MAYBE_sqr_toom3`, `MAYBE_sqr_above_toom3`, `MAYBE_sqr_above_toom4`, `TOOM6_SQR_REC`,
10//      `mpn_toom6_sqr`, `SQR_TOOM8_MAX`, `MAYBE_sqr_basecase`, `MAYBE_sqr_above_basecase`,
11//      `MAYBE_sqr_toom2`, `MAYBE_sqr_above_toom2`, `MAYBE_sqr_toom3`, `MAYBE_sqr_above_toom3`,
12//      `MAYBE_sqr_toom4`, `MAYBE_sqr_above_toom4`, `MAYBE_sqr_above_toom6`, `TOOM8_SQR_REC`, and
13//      `mpn_toom8_sqr` contributed to the GNU project by Marco Bodrato.
14//
15//      `MAYBE_sqr_toom3`, `MAYBE_sqr_basecase`, `TOOM3_SQR_REC`, and `mpn_toom3_sqr` contributed to
16//      the GNU project by Torbjörn Granlund, with additional improvements by Marco Bodrato.
17//
18//      `MAYBE_sqr_toom2`, `MAYBE_sqr_toom4`, `TOOM4_SQR_REC`, and `mpn_toom4_sqr` contributed to
19//      the GNU project by Torbjörn Granlund and Marco Bodrato.
20//
21//      Copyright © 1991-2018 Free Software Foundation, Inc.
22//
23// This file is part of Malachite.
24//
25// Malachite is free software: you can redistribute it and/or modify it under the terms of the GNU
26// Lesser General Public License (LGPL) as published by the Free Software Foundation; either version
27// 3 of the License, or (at your option) any later version. See <https://www.gnu.org/licenses/>.
28
29use crate::natural::InnerNatural::{Large, Small};
30use crate::natural::Natural;
31use crate::natural::arithmetic::add::{
32    limbs_add_limb_to_out, limbs_add_same_length_to_out, limbs_add_to_out,
33    limbs_slice_add_greater_in_place_left, limbs_slice_add_limb_in_place,
34    limbs_slice_add_same_length_in_place_left,
35};
36use crate::natural::arithmetic::add_mul::limbs_slice_add_mul_limb_same_length_in_place_left;
37use crate::natural::arithmetic::mul::fft::mpn_square_default_mpn_ctx;
38use crate::natural::arithmetic::mul::limb::limbs_mul_limb_to_out;
39use crate::natural::arithmetic::mul::limbs_mul_greater_to_out_basecase;
40use crate::natural::arithmetic::mul::poly_eval::{
41    limbs_mul_toom_evaluate_deg_3_poly_in_1_and_neg_1,
42    limbs_mul_toom_evaluate_deg_3_poly_in_2_and_neg_2, limbs_mul_toom_evaluate_poly_in_1_and_neg_1,
43    limbs_mul_toom_evaluate_poly_in_2_and_neg_2,
44    limbs_mul_toom_evaluate_poly_in_2_pow_and_neg_2_pow,
45    limbs_mul_toom_evaluate_poly_in_2_pow_neg_and_neg_2_pow_neg,
46};
47use crate::natural::arithmetic::mul::poly_interpolate::{
48    limbs_mul_toom_interpolate_5_points, limbs_mul_toom_interpolate_7_points,
49    limbs_mul_toom_interpolate_12_points, limbs_mul_toom_interpolate_16_points,
50};
51use crate::natural::arithmetic::mul::toom::{
52    BIT_CORRECTION, TUNE_PROGRAM_BUILD, WANT_FAT_BINARY, limbs_toom_couple_handling,
53};
54use crate::natural::arithmetic::shl::{limbs_shl_to_out, limbs_slice_shl_in_place};
55use crate::natural::arithmetic::sub::{
56    limbs_sub_limb_in_place, limbs_sub_same_length_in_place_left, limbs_sub_same_length_to_out,
57};
58use crate::natural::comparison::cmp::limbs_cmp_same_length;
59use crate::platform::{
60    DoubleLimb, Limb, SQR_BASECASE_THRESHOLD, SQR_TOOM2_THRESHOLD, SQR_TOOM3_THRESHOLD,
61    SQR_TOOM4_THRESHOLD, SQR_TOOM6_THRESHOLD, SQR_TOOM8_THRESHOLD,
62};
63use alloc::vec::Vec;
64use core::cmp::{Ordering::*, max};
65use malachite_base::fail_on_untested_path;
66use malachite_base::num::arithmetic::traits::{
67    ArithmeticCheckedShl, DivRound, ShrRound, Square, SquareAssign, WrappingAddAssign,
68    WrappingSubAssign, XMulYToZZ,
69};
70use malachite_base::num::basic::integers::PrimitiveInt;
71use malachite_base::num::basic::traits::{One, Zero};
72use malachite_base::num::conversion::traits::{SplitInHalf, WrappingFrom};
73use malachite_base::rounding_modes::RoundingMode::*;
74
75const SQR_FFT_MODF_THRESHOLD: usize = SQR_TOOM3_THRESHOLD * 3;
76
77// # Worst-case complexity
78// $T(n) = O(n)$
79//
80// $M(n) = O(1)$
81//
82// where $T$ is time, $M$ is additional memory, and $n$ is `xs.len()`.
83//
84// This is equivalent to `MPN_SQR_DIAGONAL` from `mpn/generic/sqr_basecase.c`, GMP 6.2.1.
85#[inline]
86pub(crate) fn limbs_square_diagonal(out: &mut [Limb], xs: &[Limb]) {
87    for (i, &x) in xs.iter().enumerate() {
88        let i_2 = i << 1;
89        (out[i_2 | 1], out[i_2]) = DoubleLimb::from(x).square().split_in_half();
90    }
91}
92
93// scratch must have length 2 * xs.len() - 2 and out must have length 2 * xs.len().
94//
95// # Worst-case complexity
96// $T(n) = O(n)$
97//
98// $M(n) = O(1)$
99//
100// where $T$ is time, $M$ is additional memory, and $n$ is `xs.len()`.
101//
102// This is equivalent to `MPN_SQR_DIAG_ADDLSH1` from `mpn/generic/sqr_basecase.c`, GMP 6.2.1.
103pub_test! {limbs_square_diagonal_add_shl_1(out: &mut [Limb], scratch: &mut [Limb], xs: &[Limb]) {
104    limbs_square_diagonal(out, xs);
105    let (out_last, out_init) = out.split_last_mut().unwrap();
106    *out_last += limbs_slice_shl_in_place(scratch, 1);
107    if limbs_slice_add_same_length_in_place_left(&mut out_init[1..], scratch) {
108        *out_last += 1;
109    }
110}}
111
112// Interpreting a slices of `Limb`s as the limbs (in ascending order) of a `Natural`s, writes the `2
113// * xs.len()` least-significant limbs of the square of the `Natural`s to an output slice. The
114// output must be at least twice as long as `xs.len()`, `xs.len()` must be less than
115// `SQR_TOOM2_THRESHOLD`, and `xs` cannot be empty.
116//
117// # Worst-case complexity
118// $T(n) = O(n^2)$
119//
120// $M(n) = O(1)$
121//
122// where $T$ is time, $M$ is additional memory, and $n$ is `xs.len()`.
123//
124// # Panics
125// Panics if `out` is less than twice the length of `xs`, `xs.len()` > SQR_TOOM2_THRESHOLD, or if
126// `xs` is empty.
127//
128// This is equivalent to `mpn_sqr_basecase` from `mpn/generic/sqr_basecase.c`, GMP 6.2.1.
129pub_crate_test! {limbs_square_to_out_basecase(out: &mut [Limb], xs: &[Limb]) {
130    let n = xs.len();
131    let (xs_head, xs_tail) = xs.split_first().unwrap();
132    (out[1], out[0]) = DoubleLimb::from(*xs_head).square().split_in_half();
133    if n > 1 {
134        assert!(n <= SQR_TOOM2_THRESHOLD);
135        let scratch = &mut [0; SQR_TOOM2_THRESHOLD << 1];
136        let two_n = n << 1;
137        let scratch = &mut scratch[..two_n - 2];
138        let (scratch_last, scratch_init) = scratch[..n].split_last_mut().unwrap();
139        *scratch_last = limbs_mul_limb_to_out::<DoubleLimb, Limb>(scratch_init, xs_tail, *xs_head);
140        for i in 1..n - 1 {
141            let (scratch_last, scratch_init) = scratch[i..][i..n].split_last_mut().unwrap();
142            let (xs_head, xs_tail) = xs[i..].split_first().unwrap();
143            *scratch_last =
144                limbs_slice_add_mul_limb_same_length_in_place_left(scratch_init, xs_tail, *xs_head);
145        }
146        limbs_square_diagonal_add_shl_1(&mut out[..two_n], scratch, xs);
147    }
148}}
149
150// # Worst-case complexity
151// Constant time and additional memory.
152//
153// This is equivalent to `mpn_toom2_sqr_itch` from `gmp-impl.h`, GMP 6.2.1.
154pub_const_test! {limbs_square_to_out_toom_2_scratch_len(xs_len: usize) -> usize {
155    (xs_len + Limb::WIDTH as usize) << 1
156}}
157
158// TODO tune
159
160/// This is equivalent to `MAYBE_sqr_toom2` from `mpn/generic/toom2_sqr.c`, GMP 6.2.1.
161const TOOM2_MAYBE_SQR_TOOM2: bool =
162    TUNE_PROGRAM_BUILD || WANT_FAT_BINARY || SQR_TOOM3_THRESHOLD >= 2 * SQR_TOOM2_THRESHOLD;
163
164// # Worst-case complexity
165// $T(n) = O(n^{\log_2 3}) \approx O(n^{1.585})$
166//
167// $M(n) = O(1)$
168//
169// where $T$ is time, $M$ is additional memory, and $n$ is `xs.len()`.
170//
171// This is equivalent to `TOOM2_SQR_REC` from `mpn/generic/toom2_sqr.c`, GMP 6.2.1.
172fn limbs_square_to_out_toom_2_recursive(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
173    if !TOOM2_MAYBE_SQR_TOOM2 || xs.len() < SQR_TOOM2_THRESHOLD {
174        limbs_square_to_out_basecase(out, xs);
175    } else {
176        limbs_square_to_out_toom_2(out, xs, scratch);
177    }
178}
179
180// Interpreting a slices of `Limb`s as the limbs (in ascending order) of a `Natural`, writes the `2
181// * xs.len()` least-significant limbs of the square of the `Natural` to an output slice. A scratch
182// slice is provided for the algorithm to use. An upper bound for the number of scratch limbs needed
183// is provided by `limbs_square_to_out_toom_2_scratch_len`. The following restrictions on the input
184// slices must be met:
185// - `out`.len() >= 2 * `xs`.len()
186// - `xs`.len() > 1
187//
188// The smallest allowable `xs` length is 2.
189//
190// Evaluate in: -1, 0, infinity.
191// ```
192// <-s--><--n-->
193//  ____ ______
194// |xs1_|__xs0_|
195//
196// v_0     = xs_0 ^ 2          # X(0) ^ 2
197// v_neg_1 = (xs_0 - xs_1) ^ 2 # X(-1) ^ 2
198// v_inf   = xs_1 ^ 2          # X(inf) ^ 2
199// ```
200//
201// # Worst-case complexity
202// $T(n) = O(n^{\log_2 3}) \approx O(n^{1.585})$
203//
204// $M(n) = O(1)$
205//
206// where $T$ is time, $M$ is additional memory, and $n$ is `xs.len()`.
207//
208// # Panics
209// May panic if the input slice conditions are not met.
210//
211// This is equivalent to `mpn_toom2_sqr` from `mpn/generic/toom2_sqr.c`, GMP 6.2.1.
212pub_test! {limbs_square_to_out_toom_2(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
213    let xs_len = xs.len();
214    assert!(xs_len > 1);
215    let out = &mut out[..xs_len << 1];
216    let s = xs_len >> 1;
217    let n = xs_len - s;
218    let (xs_0, xs_1) = xs.split_at(n);
219    if s == n {
220        if limbs_cmp_same_length(xs_0, xs_1) == Less {
221            limbs_sub_same_length_to_out(out, xs_1, xs_0);
222        } else {
223            limbs_sub_same_length_to_out(out, xs_0, xs_1);
224        }
225    } else {
226        // n - s == 1
227        let (xs_0_last, xs_0_init) = xs_0.split_last().unwrap();
228        let (out_last, out_init) = out[..n].split_last_mut().unwrap();
229        if *xs_0_last == 0 && limbs_cmp_same_length(xs_0_init, xs_1) == Less {
230            limbs_sub_same_length_to_out(out_init, xs_1, xs_0_init);
231            *out_last = 0;
232        } else {
233            *out_last = *xs_0_last;
234            if limbs_sub_same_length_to_out(out_init, xs_0_init, xs_1) {
235                out_last.wrapping_sub_assign(1);
236            }
237        }
238    }
239    let (v_0, v_inf) = out.split_at_mut(n << 1);
240    let (v_neg_1, scratch_out) = scratch.split_at_mut(n << 1);
241    limbs_square_to_out_toom_2_recursive(v_neg_1, &v_0[..n], scratch_out);
242    limbs_square_to_out_toom_2_recursive(v_inf, xs_1, scratch_out);
243    limbs_square_to_out_toom_2_recursive(v_0, xs_0, scratch_out);
244    let (v_0_lo, v_0_hi) = v_0.split_at_mut(n);
245    let (v_inf_lo, v_inf_hi) = v_inf.split_at_mut(n);
246    let mut carry = Limb::from(limbs_slice_add_same_length_in_place_left(v_inf_lo, v_0_hi));
247    let mut carry2 = carry;
248    if limbs_add_same_length_to_out(v_0_hi, v_inf_lo, v_0_lo) {
249        carry2 += 1;
250    }
251    if limbs_slice_add_greater_in_place_left(v_inf_lo, &v_inf_hi[..s + s - n]) {
252        carry += 1;
253    }
254    if limbs_sub_same_length_in_place_left(&mut out[n..3 * n], v_neg_1) {
255        carry.wrapping_sub_assign(1);
256    }
257    assert!(carry.wrapping_add(1) <= 3);
258    assert!(carry2 <= 2);
259    let carry3 = limbs_slice_add_limb_in_place(&mut out[n << 1..], carry2);
260    let out_hi = &mut out[3 * n..];
261    if carry <= 2 {
262        assert!(!limbs_slice_add_limb_in_place(out_hi, carry));
263    } else if limbs_sub_limb_in_place(out_hi, 1) {
264        assert!(carry3);
265    }
266}}
267
268// This function can be used to determine whether the size of the input slice to
269// `limbs_square_to_out_toom_3` is valid.
270//
271// # Worst-case complexity
272// Constant time and additional memory.
273pub_const_test! {limbs_square_to_out_toom_3_input_size_valid(xs_len: usize) -> bool {
274    xs_len == 3 || xs_len > 4
275}}
276
277// # Worst-case complexity
278// Constant time and additional memory.
279//
280// This is equivalent to `mpn_toom3_sqr_itch` from `gmp-impl.h`, GMP 6.2.1.
281pub_const_test! {limbs_square_to_out_toom_3_scratch_len(xs_len: usize) -> usize {
282    3 * xs_len + Limb::WIDTH as usize
283}}
284
285// TODO tune
286
287const SMALLER_RECURSION_TOOM_3: bool = true;
288
289// TODO tune
290
291// This is equivalent to `MAYBE_sqr_toom3` from `mpn/generic/toom3_sqr.c`, GMP 6.2.1.
292#[cfg(feature = "test_build")]
293pub const TOOM3_MAYBE_SQR_TOOM3: bool =
294    TUNE_PROGRAM_BUILD || WANT_FAT_BINARY || SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD;
295#[cfg(not(feature = "test_build"))]
296const TOOM3_MAYBE_SQR_TOOM3: bool =
297    TUNE_PROGRAM_BUILD || WANT_FAT_BINARY || SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD;
298
299// TODO tune
300
301/// This is equivalent to `MAYBE_sqr_basecase` from `mpn/generic/toom3_sqr.c`, GMP 6.2.1.
302#[cfg(feature = "test_build")]
303pub const TOOM3_MAYBE_SQR_BASECASE: bool =
304    TUNE_PROGRAM_BUILD || WANT_FAT_BINARY || SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD;
305#[cfg(not(feature = "test_build"))]
306const TOOM3_MAYBE_SQR_BASECASE: bool =
307    TUNE_PROGRAM_BUILD || WANT_FAT_BINARY || SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD;
308
309// # Worst-case complexity
310// $T(n) = O(n^{\log_3 5}) \approx O(n^{1.465})$
311//
312// $M(n) = O(1)$
313//
314// where $T$ is time, $M$ is additional memory, and $n$ is `xs.len()`.
315//
316// This is equivalent to `TOOM3_SQR_REC` from `mpn/generic/toom3_sqr.c`, GMP 6.2.1.
317fn limbs_square_to_out_toom_3_recursive(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
318    let n = xs.len();
319    if TOOM3_MAYBE_SQR_BASECASE && n < SQR_TOOM2_THRESHOLD {
320        limbs_square_to_out_basecase(out, xs);
321    } else if !TOOM3_MAYBE_SQR_TOOM3 || n < SQR_TOOM3_THRESHOLD {
322        limbs_square_to_out_toom_2(out, xs, scratch);
323    } else {
324        limbs_square_to_out_toom_3(out, xs, scratch);
325    }
326}
327
328// Interpreting a slices of `Limb`s as the limbs (in ascending order) of a `Natural`, writes the `2
329// * xs.len()` least-significant limbs of the square of the `Natural` to an output slice. A scratch
330// slice is provided for the algorithm to use. An upper bound for the number of scratch limbs needed
331// is provided by `limbs_square_to_out_toom_3_scratch_len`. The following restrictions on the input
332// slices must be met:
333// - `out`.len() >= 2 * `xs`.len()
334// - `xs`.len() == 3 or `xs`.len() > 4
335//
336// The smallest allowable `xs` length is 3.
337//
338// Evaluate in: -1, 0, +1, +2, +inf
339// ```
340// <-s--><--n--><--n-->
341//  ____ ______ ______
342// |xs_2|_xs_1_|_xs_0_|
343//
344// v_0     = xs_0 ^ 2                         # X(0)^2
345// v_1     = (xs_0 + xs_1 + xs_2) ^ 2         # X(1)^2    xh  <= 2
346// v_neg_1 = (xs_0 - xs_1 + xs_2) ^ 2         # X(-1)^2  |xh| <= 1
347// v_2     = (xs_0 + 2 * xs_1 + 4 * xs_2) ^ 2 # X(2)^2    xh  <= 6
348// v_inf   = xs_2 ^ 2                         # X(inf)^2
349// ```
350//
351// # Worst-case complexity
352// $T(n) = O(n^{\log_3 5}) \approx O(n^{1.465})$
353//
354// $M(n) = O(1)$
355//
356// where $T$ is time, $M$ is additional memory, and $n$ is `xs.len()`.
357//
358// # Panics
359// May panic if the input slice conditions are not met.
360//
361// This is equivalent to `mpn_toom3_sqr` from `mpn/generic/toom3_sqr.c`, GMP 6.2.1.
362pub_test! {limbs_square_to_out_toom_3(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
363    let xs_len = xs.len();
364    let n = xs_len.div_round(3, Ceiling).0;
365    let m = n + 1;
366    let k = m + n;
367    let s = xs_len - (n << 1);
368    assert_ne!(s, 0);
369    assert!(s <= n);
370    split_into_chunks!(xs, n, [xs_0, xs_1], xs_2);
371    split_into_chunks_mut!(scratch, m << 1, [scratch_lo, asm1], as1);
372    let (asm1_last, asm1_init) = asm1[..m].split_last_mut().unwrap();
373    let (as1_last, as1_init) = as1[..m].split_last_mut().unwrap();
374    let scratch_lo = &mut scratch_lo[..n];
375    let mut carry = Limb::from(limbs_add_to_out(scratch_lo, xs_0, xs_2));
376    *as1_last = carry;
377    if limbs_add_same_length_to_out(as1_init, scratch_lo, xs_1) {
378        *as1_last += 1;
379    }
380    if carry == 0 && limbs_cmp_same_length(scratch_lo, xs_1) == Less {
381        limbs_sub_same_length_to_out(asm1_init, xs_1, scratch_lo);
382    } else if limbs_sub_same_length_to_out(asm1_init, scratch_lo, xs_1) {
383        carry.wrapping_sub_assign(1);
384    }
385    *asm1_last = carry;
386    let as2 = &mut out[m..m << 1];
387    let (as2_last, as2_init) = as2.split_last_mut().unwrap();
388    let (as1_lo, as1_hi) = as1_init.split_at_mut(s);
389    let mut carry = Limb::from(limbs_add_same_length_to_out(as2_init, xs_2, as1_lo));
390    if s != n {
391        carry = Limb::from(limbs_add_limb_to_out(&mut as2_init[s..], as1_hi, carry));
392    }
393    carry.wrapping_add_assign(*as1_last);
394    carry = carry.arithmetic_checked_shl(1).unwrap();
395    carry.wrapping_add_assign(limbs_slice_shl_in_place(as2_init, 1));
396    if limbs_sub_same_length_in_place_left(as2_init, xs_0) {
397        carry.wrapping_sub_assign(1);
398    }
399    *as2_last = carry;
400    assert!(*as1_last <= 2);
401    assert!(*asm1_last <= 1);
402    let (scratch_lo, scratch_out) = scratch.split_at_mut(5 * m);
403    if SMALLER_RECURSION_TOOM_3 {
404        let (v_neg_1, asm1) = scratch_lo.split_at_mut(k);
405        let (v_neg_1_last, v_neg_1_init) = v_neg_1.split_last_mut().unwrap();
406        let (asm1_last, asm1_init) = asm1[1..n + 2].split_last().unwrap();
407        limbs_square_to_out_toom_3_recursive(v_neg_1_init, asm1_init, scratch_out);
408        *v_neg_1_last = if *asm1_last != 0 {
409            asm1_last.wrapping_add(limbs_slice_add_mul_limb_same_length_in_place_left(
410                &mut v_neg_1_init[n..],
411                asm1_init,
412                2,
413            ))
414        } else {
415            0
416        };
417    } else {
418        fail_on_untested_path("limbs_square_to_out_toom_3, !SMALLER_RECURSION");
419        let (v_neg_1, asm1) = scratch_lo.split_at_mut(m << 1);
420        limbs_square_to_out_toom_3_recursive(v_neg_1, &asm1[..m], scratch_out);
421    }
422    limbs_square_to_out_toom_3_recursive(&mut scratch_lo[k..], as2, scratch_out);
423    let v_inf = &mut out[n << 2..];
424    limbs_square_to_out_toom_3_recursive(v_inf, xs_2, scratch_out);
425    let v_inf_0 = v_inf[0];
426    let (as1, scratch_out) = &mut scratch[m << 2..].split_at_mut(m);
427    let out_hi = &mut out[n << 1..];
428    if SMALLER_RECURSION_TOOM_3 {
429        let (v_1_last, v_1_init) = out_hi[..k].split_last_mut().unwrap();
430        let (as1_last, as1_init) = as1.split_last_mut().unwrap();
431        limbs_square_to_out_toom_3_recursive(v_1_init, as1_init, scratch_out);
432        let v_1_init = &mut v_1_init[n..];
433        *v_1_last = if *as1_last == 1 {
434            limbs_slice_add_mul_limb_same_length_in_place_left(v_1_init, as1_init, 2)
435                .wrapping_add(1)
436        } else if *as1_last != 0 {
437            as1_last.arithmetic_checked_shl(1u64).unwrap().wrapping_add(
438                limbs_slice_add_mul_limb_same_length_in_place_left(v_1_init, as1_init, 4),
439            )
440        } else {
441            0
442        };
443    } else {
444        let carry = out_hi[k];
445        limbs_square_to_out_toom_3_recursive(out_hi, as1, scratch_out);
446        out_hi[k] = carry;
447    }
448    let (v_neg_1, remainder) = scratch.split_at_mut(k);
449    let (v_2, scratch_out) = remainder.split_at_mut(3 * n + 4);
450    limbs_square_to_out_toom_3_recursive(out, xs_0, scratch_out);
451    limbs_mul_toom_interpolate_5_points(out, v_2, v_neg_1, n, s << 1, false, v_inf_0);
452}}
453
454// This function can be used to determine whether the size of the input slice to
455// `limbs_square_to_out_toom_4` is valid.
456//
457// # Worst-case complexity
458// Constant time and additional memory.
459pub_const_test! {limbs_square_to_out_toom_4_input_size_valid(xs_len: usize) -> bool {
460    xs_len == 4 || xs_len == 7 || xs_len == 8 || xs_len > 9
461}}
462
463// # Worst-case complexity
464// Constant time and additional memory.
465//
466// This is equivalent to `mpn_toom4_sqr_itch` from `gmp-impl.h`, GMP 6.2.1.
467pub_const_test! {limbs_square_to_out_toom_4_scratch_len(xs_len: usize) -> usize {
468    3 * xs_len + Limb::WIDTH as usize
469}}
470
471// TODO tune
472
473// This is equivalent to `MAYBE_sqr_toom2` from `mpn/generic/toom4_sqr.c`, GMP 6.2.1.
474const TOOM4_MAYBE_SQR_TOOM2: bool =
475    TUNE_PROGRAM_BUILD || SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM3_THRESHOLD;
476
477// TODO tune
478
479// This is equivalent to `MAYBE_sqr_toom4` from `mpn/generic/toom4_sqr.c`, GMP 6.2.1.
480const TOOM4_MAYBE_SQR_TOOM4: bool =
481    TUNE_PROGRAM_BUILD || SQR_TOOM6_THRESHOLD >= 4 * SQR_TOOM4_THRESHOLD;
482
483// # Worst-case complexity
484// $T(n) = O(n^{\log_4 7}) \approx O(n^{1.404})$
485//
486// $M(n) = O(1)$
487//
488// where $T$ is time, $M$ is additional memory, and $n$ is `xs.len()`.
489//
490// This is equivalent to `TOOM4_SQR_REC` from `mpn/generic/toom4_sqr.c`, GMP 6.2.1.
491fn limbs_square_to_out_toom_4_recursive(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
492    let n = xs.len();
493    if n < SQR_TOOM2_THRESHOLD {
494        // We don't check TOOM4_MAYBE_SQR_BASECASE because we never want the Toom functions to
495        // handle very small inputs.
496        limbs_square_to_out_basecase(out, xs);
497    } else if TOOM4_MAYBE_SQR_TOOM2 && n < SQR_TOOM3_THRESHOLD {
498        limbs_square_to_out_toom_2(out, xs, scratch);
499    } else if !TOOM4_MAYBE_SQR_TOOM4 || n < SQR_TOOM4_THRESHOLD {
500        limbs_square_to_out_toom_3(out, xs, scratch);
501    } else {
502        limbs_square_to_out_toom_4(out, xs, scratch);
503    }
504}
505
506// Interpreting a slices of `Limb`s as the limbs (in ascending order) of a `Natural`, writes the `2
507// * xs.len()` least-significant limbs of the square of the `Natural` to an output slice. A scratch
508// slice is provided for the algorithm to use. An upper bound for the number of scratch limbs needed
509// is provided by `limbs_square_to_out_toom_4_scratch_len`. The following restrictions on the input
510// slices must be met:
511// - `out`.len() >= 2 * `xs`.len()
512// - `xs`.len() is 4, 7, or 8, or `xs`.len() > 9.
513//
514// The smallest allowable `xs` length is 4.
515//
516// Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf
517// ```
518// <-s--><--n--><--n--><--n-->
519//  ____ ______ ______ ______
520// |xs_3|_xs_2_|_xs_1_|_xs_0_|
521//
522// v_0     = xs_0 ^ 2                                    # X(0) ^ 2
523// v_1     = (xs_0 + xs_1 + xs_2 + xs_3) ^ 2             # X(1) ^ 2     xh <= 3
524// v_neg_1 = (xs_0 - xs_1 + xs_2 - xs_3) ^ 2             # X(-1) ^ 2    |xh| <= 1
525// v_2     = (xs_0 + 2 * xs_1 + 4 * xs_2 + 8 * xs_3) ^ 2 # X(2) ^ 2     xh <= 14
526// vh      = (8 * xs_0 + 4 * xs_1 + 2 * xs_2 + xs_3) ^ 2 # X(1/2) ^ 2   xh <= 14
527// vmh     = (8 * xs_0 - 4 * xs_1 + 2 * xs_2 - xs_3) ^ 2 # X(-1/2) ^ 2  -4 <= xh <= 9
528// v_inf   = xs_3 ^ 2                                    # X(inf) ^ 2
529// ```
530//
531// # Worst-case complexity
532// $T(n) = O(n^{\log_4 7}) \approx O(n^{1.404})$
533//
534// $M(n) = O(1)$
535//
536// where $T$ is time, $M$ is additional memory, and $n$ is `xs.len()`.
537//
538// # Panics
539// May panic if the input slice conditions are not met.
540//
541// This is equivalent to `mpn_toom4_sqr` from `mpn/generic/toom4_sqr.c`, GMP 6.2.1.
542pub_test! {limbs_square_to_out_toom_4(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
543    let xs_len = xs.len();
544    let n = (xs_len + 3) >> 2;
545    let s = xs_len - 3 * n;
546    assert_ne!(s, 0);
547    assert!(s <= n);
548    let m = n + 1;
549    let k = m + n;
550    split_into_chunks!(xs, n, [xs_0, xs_1, xs_2], xs_3);
551    // Total scratch need: 8 * n + 5 + scratch for recursive calls. This gives roughly 32 * n / 3 +
552    // log term. Compute apx = xs_0 + 2 * xs_1 + 4 * xs_2 + 8 * xs_3 and amx = xs_0 - 2 * xs_1 + 4
553    // * xs_2 - 8 * xs_3.
554    let (apx, remainder) = out.split_at_mut(n << 1);
555    let apx = &mut apx[..m];
556    let (v1, amx) = remainder.split_at_mut(m << 1);
557    let amx = &mut amx[..m];
558    let (scratch_lo, scratch_hi) = scratch.split_at_mut((k << 2) + 1);
559    limbs_mul_toom_evaluate_deg_3_poly_in_2_and_neg_2(apx, amx, xs, n, &mut scratch_hi[..m]);
560    limbs_square_to_out_toom_4_recursive(scratch_lo, apx, scratch_hi);
561    let scratch_lo = &mut scratch_lo[k..];
562    limbs_square_to_out_toom_4_recursive(scratch_lo, amx, scratch_hi);
563    // Compute apx = 8 xs_0 + 4 xs_1 + 2 xs_2 + xs_3 = (((2 xs_0 + xs_1) * 2 + xs_2) * 2 + xs_3
564    let (apx_last, apx_init) = apx.split_last_mut().unwrap();
565    let mut carry = limbs_shl_to_out(apx_init, xs_0, 1);
566    if limbs_slice_add_same_length_in_place_left(apx_init, xs_1) {
567        carry.wrapping_add_assign(1);
568    }
569    carry = carry.arithmetic_checked_shl(1).unwrap();
570    carry.wrapping_add_assign(limbs_slice_shl_in_place(apx_init, 1));
571    if limbs_slice_add_same_length_in_place_left(apx_init, xs_2) {
572        carry.wrapping_add_assign(1);
573    }
574    carry = carry.arithmetic_checked_shl(1).unwrap();
575    carry.wrapping_add_assign(limbs_slice_shl_in_place(apx_init, 1));
576    if limbs_slice_add_greater_in_place_left(apx_init, xs_3) {
577        carry.wrapping_add_assign(1);
578    }
579    *apx_last = carry;
580    assert!(*apx_last < 15);
581    let scratch_lo = &mut scratch_lo[k..];
582    limbs_square_to_out_toom_4_recursive(scratch_lo, apx, scratch_hi);
583    // Compute apx = xs_0 + xs_1 + xs_2 + xs_3 and amx = xs_0 - xs_1 + xs_2 - xs_3.
584    limbs_mul_toom_evaluate_deg_3_poly_in_1_and_neg_1(apx, amx, xs, n, &mut scratch_hi[..m]);
585    limbs_square_to_out_toom_4_recursive(v1, apx, scratch_hi);
586    let scratch_lo = &mut scratch_lo[k..];
587    limbs_square_to_out_toom_4_recursive(scratch_lo, amx, scratch_hi);
588    let (v0, vinf) = out.split_at_mut(n << 1);
589    let vinf = &mut vinf[n << 2..];
590    limbs_square_to_out_toom_4_recursive(v0, xs_0, scratch_hi);
591    limbs_square_to_out_toom_4_recursive(vinf, xs_3, scratch_hi);
592    split_into_chunks_mut!(scratch, k, [v2, vm2, vh, vm1], scratch_hi);
593    limbs_mul_toom_interpolate_7_points(out, n, s << 1, false, vm2, false, vm1, v2, vh, scratch_hi);
594}}
595
596// This function can be used to determine whether the size of the input slice to
597// `limbs_square_to_out_toom_6` is valid.
598//
599// # Worst-case complexity
600// Constant time and additional memory.
601pub_const_test! {limbs_square_to_out_toom_6_input_size_valid(xs_len: usize) -> bool {
602    xs_len == 18 || xs_len > 21 && xs_len != 25 && xs_len != 26 && xs_len != 31
603}}
604
605// # Worst-case complexity
606// Constant time and additional memory.
607//
608// This is equivalent to `mpn_toom6_sqr_itch` from `gmp-impl.h`, GMP 6.2.1.
609pub_crate_test! {limbs_square_to_out_toom_6_scratch_len(n: usize) -> usize {
610    (n << 1)
611        + max(
612            (SQR_TOOM6_THRESHOLD << 1) + usize::wrapping_from(Limb::WIDTH) * 6,
613            limbs_square_to_out_toom_4_scratch_len(SQR_TOOM6_THRESHOLD),
614        )
615        - (SQR_TOOM6_THRESHOLD << 1)
616}}
617
618// TODO tune
619
620/// This is equivalent to `SQR_TOOM6_MAX` from `mpn/generic/toom6_sqr.c`, GMP 6.2.1.
621const SQR_TOOM6_MAX: usize = (SQR_TOOM8_THRESHOLD + 6 * 2 - 1).div_ceil(6);
622
623// TODO tune
624
625/// This is equivalent to `MAYBE_sqr_basecase` from `mpn/generic/toom6_sqr.c`, GMP 6.2.1.
626const TOOM6_MAYBE_SQR_BASECASE: bool =
627    TUNE_PROGRAM_BUILD || SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM2_THRESHOLD;
628
629// TODO tune
630
631/// This is equivalent to `MAYBE_sqr_toom2` from `mpn/generic/toom6_sqr.c`, GMP 6.2.1.
632const TOOM6_MAYBE_SQR_TOOM2: bool =
633    TUNE_PROGRAM_BUILD || SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM3_THRESHOLD;
634
635// TODO tune
636
637/// This is equivalent to `MAYBE_sqr_above_toom2` from `mpn/generic/toom6_sqr.c`, GMP 6.2.1.
638const TOOM6_MAYBE_SQR_ABOVE_TOOM2: bool =
639    TUNE_PROGRAM_BUILD || SQR_TOOM6_MAX >= SQR_TOOM3_THRESHOLD;
640
641// TODO tune
642
643/// This is equivalent to `MAYBE_sqr_toom3` from `mpn/generic/toom6_sqr.c`, GMP 6.2.1.
644const TOOM6_MAYBE_SQR_TOOM3: bool =
645    TUNE_PROGRAM_BUILD || SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM4_THRESHOLD;
646
647// TODO tune
648
649/// This is equivalent to `MAYBE_sqr_above_toom3` from `mpn/generic/toom6_sqr.c`, GMP 6.2.1.
650const TOOM6_MAYBE_SQR_ABOVE_TOOM3: bool =
651    TUNE_PROGRAM_BUILD || SQR_TOOM6_MAX >= SQR_TOOM4_THRESHOLD;
652
653// TODO tune
654
655/// This is equivalent to `MAYBE_sqr_above_toom4` from `mpn/generic/toom6_sqr.c`, GMP 6.2.1.
656const TOOM6_MAYBE_SQR_ABOVE_TOOM4: bool =
657    TUNE_PROGRAM_BUILD || SQR_TOOM6_MAX >= SQR_TOOM6_THRESHOLD;
658
659// # Worst-case complexity
660// $T(n) = O(n^{\log_6 11}) \approx O(n^{1.338})$
661//
662// $M(n) = O(1)$
663//
664// where $T$ is time, $M$ is additional memory, and $n$ is `xs.len()`.
665//
666// This is equivalent to `TOOM6_SQR_REC` from `mpn/generic/toom6_sqr.c`, GMP 6.2.1.
667fn limbs_square_to_out_toom_6_recursive(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
668    let n = xs.len();
669    if TOOM6_MAYBE_SQR_BASECASE && n < SQR_TOOM2_THRESHOLD {
670        limbs_square_to_out_basecase(out, xs);
671    } else if TOOM6_MAYBE_SQR_TOOM2 && (!TOOM6_MAYBE_SQR_ABOVE_TOOM2 || n < SQR_TOOM3_THRESHOLD) {
672        limbs_square_to_out_toom_2(out, xs, scratch);
673    } else if TOOM6_MAYBE_SQR_TOOM3 && (!TOOM6_MAYBE_SQR_ABOVE_TOOM3 || n < SQR_TOOM4_THRESHOLD) {
674        limbs_square_to_out_toom_3(out, xs, scratch);
675    } else if !TOOM6_MAYBE_SQR_ABOVE_TOOM4 || n < SQR_TOOM6_THRESHOLD {
676        limbs_square_to_out_toom_4(out, xs, scratch);
677    } else {
678        limbs_square_to_out_toom_6(out, xs, scratch);
679    }
680}
681
682// Interpreting a slices of `Limb`s as the limbs (in ascending order) of a `Natural`, writes the `2
683// * xs.len()` least-significant limbs of the square of the `Natural` to an output slice. A scratch
684// slice is provided for the algorithm to use. An upper bound for the number of scratch limbs needed
685// is provided by `limbs_square_to_out_toom_6_scratch_len`. The following restrictions on the input
686// slices must be met:
687// - `out`.len() >= 2 * `xs`.len()
688// - `xs`.len() is 18, or `xs.len()` > 21 but `xs`.len() is not 25, 26, or 31.
689//
690// The smallest allowable `xs` length is 18.
691//
692// # Worst-case complexity
693// $T(n) = O(n^{\log_6 11}) \approx O(n^{1.338})$
694//
695// $M(n) = O(1)$
696//
697// where $T$ is time, $M$ is additional memory, and $n$ is `xs.len()`.
698//
699// # Panics
700// May panic if the input slice conditions are not met.
701//
702// This is equivalent to `mpn_toom6_sqr` from `mpn/generic/toom6_sqr.c`, GMP 6.2.1.
703pub_test! {limbs_square_to_out_toom_6(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
704    let xs_len = xs.len();
705    assert!(xs_len >= 18);
706    let n = 1 + (xs_len - 1) / 6;
707    assert!(xs_len > 5 * n);
708    let s = xs_len - 5 * n;
709    assert!(s <= n);
710    assert!(10 * n + 3 <= xs_len << 1);
711    let m = n + 1;
712    let k = m + n;
713    let (out_lo, remainder) = out.split_at_mut(3 * n);
714    let (r4, r2) = remainder.split_at_mut(n << 2);
715    let (v0, v2) = r2.split_at_mut(m << 1);
716    let v0 = &mut v0[..m];
717    let v2 = &mut v2[..m];
718    // +/- 1/2
719    limbs_mul_toom_evaluate_poly_in_2_pow_neg_and_neg_2_pow_neg(
720        v2,
721        v0,
722        5,
723        xs,
724        n,
725        1,
726        &mut out_lo[..m],
727    );
728    split_into_chunks_mut!(scratch, 3 * n + 1, [r5, r3, r1], wse);
729    limbs_square_to_out_toom_6_recursive(out_lo, v0, wse); // X(-1/2) ^ 2 * 2 ^
730    limbs_square_to_out_toom_6_recursive(r5, v2, wse); // X(1/2) ^ 2 * 2 ^
731    limbs_toom_couple_handling(r5, &mut out_lo[..k], false, n, 1, 0);
732    // +/- 1
733    limbs_mul_toom_evaluate_poly_in_1_and_neg_1(v2, v0, 5, xs, n, &mut out_lo[..m]);
734    limbs_square_to_out_toom_6_recursive(out_lo, v0, wse); // X(-1) ^ 2
735    limbs_square_to_out_toom_6_recursive(r3, v2, wse); // X(1) ^ 2
736    limbs_toom_couple_handling(r3, &mut out_lo[..k], false, n, 0, 0);
737    // +/- 4
738    limbs_mul_toom_evaluate_poly_in_2_pow_and_neg_2_pow(v2, v0, 5, xs, n, 2, &mut out_lo[..m]);
739    limbs_square_to_out_toom_6_recursive(out_lo, v0, wse); // X(-4) ^ 2
740    limbs_square_to_out_toom_6_recursive(r1, v2, wse); // X(4) ^ 2
741    limbs_toom_couple_handling(r1, &mut out_lo[..k], false, n, 2, 4);
742    // +/- 1/4
743    limbs_mul_toom_evaluate_poly_in_2_pow_neg_and_neg_2_pow_neg(
744        v2,
745        v0,
746        5,
747        xs,
748        n,
749        2,
750        &mut out_lo[..m],
751    );
752    limbs_square_to_out_toom_6_recursive(out_lo, v0, wse); // X(-1/4) ^ 2 * 4 ^
753    limbs_square_to_out_toom_6_recursive(r4, v2, wse); // X(1/4) ^ 2 * 4 ^
754    limbs_toom_couple_handling(r4, &mut out_lo[..k], false, n, 2, 0);
755    // +/- 2
756    limbs_mul_toom_evaluate_poly_in_2_and_neg_2(v2, v0, 5, xs, n, &mut out_lo[..m]);
757    limbs_square_to_out_toom_6_recursive(out_lo, v0, wse); // X(-2) ^ 2
758    let (v0, v2) = r2.split_at_mut(m << 1);
759    limbs_square_to_out_toom_6_recursive(v0, &v2[..m], wse); // X(2) ^ 2
760    limbs_toom_couple_handling(r2, &mut out_lo[..k], false, n, 1, 2);
761    limbs_square_to_out_toom_6_recursive(out_lo, &xs[..n], wse); // X(0) ^ 2
762    limbs_mul_toom_interpolate_12_points(out, r1, r3, r5, n, s << 1, false, wse);
763}}
764
765// TODO tune
766pub(crate) const SQR_FFT_THRESHOLD: usize = SQR_FFT_MODF_THRESHOLD * 10;
767
768// This function can be used to determine whether the size of the input slice to
769// `limbs_square_to_out_toom_8` is valid.
770//
771// # Worst-case complexity
772// Constant time and additional memory.
773pub_const_test! {limbs_square_to_out_toom_8_input_size_valid(xs_len: usize) -> bool {
774    xs_len == 40 || xs_len > 43 && xs_len != 49 && xs_len != 50 && xs_len != 57
775}}
776
777// # Worst-case complexity
778// Constant time and additional memory.
779//
780// This is equivalent to `mpn_toom8_sqr_itch` from `gmp-impl.h`, GMP 6.2.1.
781pub_crate_test! {limbs_square_to_out_toom_8_scratch_len(n: usize) -> usize {
782    ((n * 15) >> 3)
783        + max(
784            ((SQR_TOOM8_THRESHOLD * 15) >> 3) + usize::wrapping_from(Limb::WIDTH) * 6,
785            limbs_square_to_out_toom_6_scratch_len(SQR_TOOM8_THRESHOLD),
786        )
787        - ((SQR_TOOM8_THRESHOLD * 15) >> 3)
788}}
789
790// TODO tune
791
792/// This is equivalent to `SQR_TOOM8_MAX` from `mpn/generic/toom8_sqr.c`, GMP 6.2.1.
793const SQR_TOOM8_MAX: usize = if SQR_FFT_THRESHOLD <= usize::MAX - (8 * 2 - 1 + 7) {
794    (SQR_FFT_THRESHOLD + 8 * 2 - 1).div_ceil(8)
795} else {
796    usize::MAX
797};
798
799// TODO tune
800
801/// This is equivalent to `MAYBE_sqr_basecase` from `mpn/generic/toom8_sqr.c`, GMP 6.2.1.
802const TOOM8_MAYBE_SQR_BASECASE: bool =
803    TUNE_PROGRAM_BUILD || SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM2_THRESHOLD;
804
805// TODO tune
806
807/// This is equivalent to `MAYBE_sqr_above_basecase` from `mpn/generic/toom8_sqr.c`, GMP 6.2.1.
808const TOOM8_MAYBE_SQR_ABOVE_BASECASE: bool =
809    TUNE_PROGRAM_BUILD || SQR_TOOM8_MAX >= SQR_TOOM2_THRESHOLD;
810
811// TODO tune
812
813/// This is equivalent to `MAYBE_sqr_toom2` from `mpn/generic/toom8_sqr.c`, GMP 6.2.1.
814const TOOM8_MAYBE_SQR_TOOM2: bool =
815    TUNE_PROGRAM_BUILD || SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM3_THRESHOLD;
816
817// TODO tune
818
819/// This is equivalent to `MAYBE_sqr_above_toom2` from `mpn/generic/toom8_sqr.c`, GMP 6.2.1.
820const TOOM8_MAYBE_SQR_ABOVE_TOOM2: bool =
821    TUNE_PROGRAM_BUILD || SQR_TOOM8_MAX >= SQR_TOOM3_THRESHOLD;
822
823// TODO tune
824
825/// This is equivalent to `MAYBE_sqr_toom3` from `mpn/generic/toom8_sqr.c`, GMP 6.2.1.
826const TOOM8_MAYBE_SQR_TOOM3: bool =
827    TUNE_PROGRAM_BUILD || SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM4_THRESHOLD;
828
829// TODO tune
830
831/// This is equivalent to `MAYBE_sqr_above_toom3` from `mpn/generic/toom8_sqr.c`, GMP 6.2.1.
832const TOOM8_MAYBE_SQR_ABOVE_TOOM3: bool =
833    TUNE_PROGRAM_BUILD || SQR_TOOM8_MAX >= SQR_TOOM4_THRESHOLD;
834
835// TODO tune
836
837/// This is equivalent to `MAYBE_sqr_toom4` from `mpn/generic/toom8_sqr.c`, GMP 6.2.1.
838const TOOM8_MAYBE_SQR_TOOM4: bool =
839    TUNE_PROGRAM_BUILD || SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM6_THRESHOLD;
840
841// TODO tune
842
843/// This is equivalent to `MAYBE_sqr_above_toom4` from `mpn/generic/toom8_sqr.c`, GMP 6.2.1.
844const TOOM8_MAYBE_SQR_ABOVE_TOOM4: bool =
845    TUNE_PROGRAM_BUILD || SQR_TOOM8_MAX >= SQR_TOOM6_THRESHOLD;
846
847// TODO tune
848
849/// This is equivalent to `MAYBE_sqr_above_toom6` from `mpn/generic/toom8_sqr.c`, GMP 6.2.1.
850const TOOM8_MAYBE_SQR_ABOVE_TOOM6: bool =
851    TUNE_PROGRAM_BUILD || SQR_TOOM8_MAX >= SQR_TOOM8_THRESHOLD;
852
853// # Worst-case complexity
854// $T(n) = O(n^{\log_6 11}) \approx O(n^{1.302})$
855//
856// $M(n) = O(1)$
857//
858// where $T$ is time, $M$ is additional memory, and $n$ is `xs.len()`.
859//
860// This is equivalent to `TOOM8_SQR_REC` from `mpn/generic/toom8_sqr.c`, GMP 6.2.1, when `f` is
861// `false`.
862fn limbs_square_to_out_toom_8_recursive(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
863    let n = xs.len();
864    if TOOM8_MAYBE_SQR_BASECASE && (!TOOM8_MAYBE_SQR_ABOVE_BASECASE || n < SQR_TOOM2_THRESHOLD) {
865        limbs_square_to_out_basecase(out, xs);
866    } else if TOOM8_MAYBE_SQR_TOOM2 && (!TOOM8_MAYBE_SQR_ABOVE_TOOM2 || n < SQR_TOOM3_THRESHOLD) {
867        limbs_square_to_out_toom_2(out, xs, scratch);
868    } else if TOOM8_MAYBE_SQR_TOOM3 && (!TOOM8_MAYBE_SQR_ABOVE_TOOM3 || n < SQR_TOOM4_THRESHOLD) {
869        limbs_square_to_out_toom_3(out, xs, scratch);
870    } else if TOOM8_MAYBE_SQR_TOOM4 && (!TOOM8_MAYBE_SQR_ABOVE_TOOM4 || n < SQR_TOOM6_THRESHOLD) {
871        limbs_square_to_out_toom_4(out, xs, scratch);
872    } else if !TOOM8_MAYBE_SQR_ABOVE_TOOM6 || n < SQR_TOOM8_THRESHOLD {
873        limbs_square_to_out_toom_6(out, xs, scratch);
874    } else {
875        limbs_square_to_out_toom_8(out, xs, scratch);
876    }
877}
878
879// Interpreting a slices of `Limb`s as the limbs (in ascending order) of a `Natural`, writes the `2
880// * xs.len()` least-significant limbs of the square of the `Natural` to an output slice. A scratch
881// slice is provided for the algorithm to use. An upper bound for the number of scratch limbs needed
882// is provided by `limbs_square_to_out_toom_8_scratch_len`. The following restrictions on the input
883// slices must be met:
884// - `out`.len() >= 2 * `xs`.len()
885// - `xs`.len() is 40, or `xs.len()` > 43 but `xs`.len() is not 49, 50, or 57.
886//
887// The smallest allowable `xs` length is 40.
888//
889// # Worst-case complexity
890// $T(n) = O(n^{\log_6 11}) \approx O(n^{1.302})$
891//
892// $M(n) = O(1)$
893//
894// where $T$ is time, $M$ is additional memory, and $n$ is `xs.len()`.
895//
896// # Panics
897// May panic if the input slice conditions are not met.
898//
899// This is equivalent to `mpn_toom8_sqr` from `mpn/generic/toom8_sqr.c`, GMP 6.2.1.
900pub_test! {limbs_square_to_out_toom_8(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
901    let xs_len = xs.len();
902    assert!(xs_len >= 40);
903    let n: usize = xs_len.shr_round(3, Ceiling).0;
904    let m = n + 1;
905    let k = m + n;
906    let p = k + n;
907    assert!(xs_len > 7 * n);
908    let s = xs_len - 7 * n;
909    assert!(s <= n);
910    assert!(s << 1 > 3);
911    let (pp_lo, remainder) = out.split_at_mut(3 * n);
912    split_into_chunks_mut!(remainder, n << 2, [r6, r4], pp_hi);
913    split_into_chunks_mut!(pp_hi, m, [v0, _unused, v2], _unused);
914    // +/- 1/8
915    limbs_mul_toom_evaluate_poly_in_2_pow_neg_and_neg_2_pow_neg(
916        v2,
917        v0,
918        7,
919        xs,
920        n,
921        3,
922        &mut pp_lo[..m],
923    );
924    let (r7_r5, remainder) = scratch.split_at_mut(p << 1);
925    let (r3, r1_wse) = remainder.split_at_mut(p);
926    let (r1, wse) = r1_wse.split_at_mut(p);
927    // A(-1/8) * B(-1/8) * 8 ^, A(1/8) * B(1/8) * 8 ^
928    limbs_square_to_out_toom_8_recursive(pp_lo, v0, wse);
929    limbs_square_to_out_toom_8_recursive(r7_r5, v2, wse);
930    let limit = if BIT_CORRECTION { m << 1 } else { k };
931    limbs_toom_couple_handling(r7_r5, &mut pp_lo[..limit], false, n, 3, 0);
932    // +/- 1/4
933    limbs_mul_toom_evaluate_poly_in_2_pow_neg_and_neg_2_pow_neg(
934        v2,
935        v0,
936        7,
937        xs,
938        n,
939        2,
940        &mut pp_lo[..m],
941    );
942    // A(-1/4) * B(-1/4) * 4 ^, A(1/4) * B(1/4) * 4^
943    limbs_square_to_out_toom_8_recursive(pp_lo, v0, wse);
944    let (r7, r5) = r7_r5.split_at_mut(p);
945    limbs_square_to_out_toom_8_recursive(r5, v2, wse);
946    limbs_toom_couple_handling(r5, &mut pp_lo[..k], false, n, 2, 0);
947    // +/- 2
948    limbs_mul_toom_evaluate_poly_in_2_and_neg_2(v2, v0, 7, xs, n, &mut pp_lo[..m]);
949    // A(-2)*B(-2), A(+2)*B(+2)
950    limbs_square_to_out_toom_8_recursive(pp_lo, v0, wse);
951    limbs_square_to_out_toom_8_recursive(r3, v2, wse);
952    limbs_toom_couple_handling(r3, &mut pp_lo[..k], false, n, 1, 2);
953    // +/- 8
954    limbs_mul_toom_evaluate_poly_in_2_pow_and_neg_2_pow(v2, v0, 7, xs, n, 3, &mut pp_lo[..m]);
955    // A(-8) * B(-8), A(8) * B(8)
956    limbs_square_to_out_toom_8_recursive(pp_lo, v0, wse);
957    limbs_square_to_out_toom_8_recursive(r1, v2, wse);
958    limbs_toom_couple_handling(r1_wse, &mut pp_lo[..limit], false, n, 3, 6);
959    // +/- 1/2
960    limbs_mul_toom_evaluate_poly_in_2_pow_neg_and_neg_2_pow_neg(
961        v2,
962        v0,
963        7,
964        xs,
965        n,
966        1,
967        &mut pp_lo[..m],
968    );
969    // A(-1/2) * B(-1/2) * 2 ^, A(1/2) * B(1/2) * 2 ^
970    let (r1, wse) = r1_wse.split_at_mut(p);
971    limbs_square_to_out_toom_8_recursive(pp_lo, v0, wse);
972    limbs_square_to_out_toom_8_recursive(r6, v2, wse);
973    limbs_toom_couple_handling(r6, &mut pp_lo[..k], false, n, 1, 0);
974    // +/- 1
975    limbs_mul_toom_evaluate_poly_in_1_and_neg_1(v2, v0, 7, xs, n, &mut pp_lo[..m]);
976    // A(-1) * B(-1), A(1) * B(1)
977    limbs_square_to_out_toom_8_recursive(pp_lo, v0, wse);
978    limbs_square_to_out_toom_8_recursive(r4, v2, wse);
979    limbs_toom_couple_handling(r4, &mut pp_lo[..k], false, n, 0, 0);
980    // +/- 4
981    limbs_mul_toom_evaluate_poly_in_2_pow_and_neg_2_pow(v2, v0, 7, xs, n, 2, &mut pp_lo[..m]);
982    // A(-4) * B(-4), A(4) * B(4)
983    limbs_square_to_out_toom_8_recursive(pp_lo, v0, wse);
984    let (r2, v2) = pp_hi.split_at_mut(m << 1);
985    limbs_square_to_out_toom_8_recursive(r2, &v2[..m], wse);
986    limbs_toom_couple_handling(pp_hi, &mut pp_lo[..k], false, n, 2, 4);
987    // A(0) * B(0)
988    limbs_square_to_out_toom_8_recursive(pp_lo, &xs[..n], wse);
989    limbs_mul_toom_interpolate_16_points(out, r1, r3, r5, r7, n, s << 1, false, &mut wse[..p]);
990}}
991
992pub_crate_test! {
993#[allow(clippy::absurd_extreme_comparisons)]
994limbs_square_to_out_scratch_len(n: usize) -> usize {
995    if n < SQR_BASECASE_THRESHOLD || n < SQR_TOOM2_THRESHOLD {
996        0
997    } else if n < SQR_TOOM3_THRESHOLD {
998        limbs_square_to_out_toom_2_scratch_len(n)
999    } else if n < SQR_TOOM4_THRESHOLD {
1000        limbs_square_to_out_toom_3_scratch_len(n)
1001    } else if n < SQR_TOOM6_THRESHOLD {
1002        limbs_square_to_out_toom_4_scratch_len(n)
1003    } else if n < SQR_TOOM8_THRESHOLD {
1004        limbs_square_to_out_toom_6_scratch_len(n)
1005    } else if n < SQR_FFT_THRESHOLD {
1006        limbs_square_to_out_toom_8_scratch_len(n)
1007    } else {
1008        0
1009    }
1010}}
1011
1012// # Worst-case complexity
1013// $T(n) = O(n \log n \log\log n)$
1014//
1015// $M(n) = O(n \log n)$
1016//
1017// where $T$ is time, $M$ is additional memory, and $n$ is `xs.len()`.
1018//
1019// This is equivalent to `mpn_sqr` from `mpn/generic/sqr.c`, GMP 6.2.1.
1020pub_crate_test! {
1021#[allow(clippy::absurd_extreme_comparisons)]
1022limbs_square_to_out(out: &mut [Limb], xs: &[Limb], scratch: &mut [Limb]) {
1023    let n = xs.len();
1024    assert!(n >= 1);
1025    if n < SQR_BASECASE_THRESHOLD {
1026        // limbs_mul_greater_to_out_basecase is faster than limbs_square_to_out_basecase on small
1027        // sizes sometimes
1028        limbs_mul_greater_to_out_basecase(out, xs, xs);
1029    } else if n < SQR_TOOM2_THRESHOLD {
1030        limbs_square_to_out_basecase(out, xs);
1031    } else if n < SQR_TOOM3_THRESHOLD {
1032        limbs_square_to_out_toom_2(out, xs, scratch);
1033    } else if n < SQR_TOOM4_THRESHOLD {
1034        limbs_square_to_out_toom_3(out, xs, scratch);
1035    } else if n < SQR_TOOM6_THRESHOLD {
1036        limbs_square_to_out_toom_4(out, xs, scratch);
1037    } else if n < SQR_TOOM8_THRESHOLD {
1038        limbs_square_to_out_toom_6(out, xs, scratch);
1039    } else if n < SQR_FFT_THRESHOLD {
1040        limbs_square_to_out_toom_8(out, xs, scratch);
1041    } else {
1042        mpn_square_default_mpn_ctx(out, xs);
1043    }
1044}}
1045
1046pub_crate_test! {limbs_square(xs: &[Limb]) -> Vec<Limb> {
1047    let mut out = vec![0; xs.len() << 1];
1048    let mut square_scratch = vec![0; limbs_square_to_out_scratch_len(xs.len())];
1049    limbs_square_to_out(&mut out, xs, &mut square_scratch);
1050    out
1051}}
1052
1053impl Square for Natural {
1054    type Output = Self;
1055
1056    /// Squares a [`Natural`], taking it by value.
1057    ///
1058    /// $$
1059    /// f(x) = x^2.
1060    /// $$
1061    ///
1062    /// # Worst-case complexity
1063    /// $T(n) = O(n \log n \log\log n)$
1064    ///
1065    /// $M(n) = O(n \log n)$
1066    ///
1067    /// where $T$ is time, $M$ is additional memory, and $n$ is `self.significant_bits()`.
1068    ///
1069    /// # Examples
1070    /// ```
1071    /// use malachite_base::num::arithmetic::traits::Square;
1072    /// use malachite_base::num::basic::traits::Zero;
1073    /// use malachite_nz::natural::Natural;
1074    ///
1075    /// assert_eq!(Natural::ZERO.square(), 0);
1076    /// assert_eq!(Natural::from(123u32).square(), 15129);
1077    /// ```
1078    #[inline]
1079    fn square(mut self) -> Self {
1080        self.square_assign();
1081        self
1082    }
1083}
1084
1085impl Square for &Natural {
1086    type Output = Natural;
1087
1088    /// Squares a [`Natural`], taking it by reference.
1089    ///
1090    /// $$
1091    /// f(x) = x^2.
1092    /// $$
1093    ///
1094    /// # Worst-case complexity
1095    /// $T(n) = O(n \log n \log\log n)$
1096    ///
1097    /// $M(n) = O(n \log n)$
1098    ///
1099    /// where $T$ is time, $M$ is additional memory, and $n$ is `self.significant_bits()`.
1100    ///
1101    /// # Examples
1102    /// ```
1103    /// use malachite_base::num::arithmetic::traits::Square;
1104    /// use malachite_base::num::basic::traits::Zero;
1105    /// use malachite_nz::natural::Natural;
1106    ///
1107    /// assert_eq!((&Natural::ZERO).square(), 0);
1108    /// assert_eq!((&Natural::from(123u32)).square(), 15129);
1109    /// ```
1110    #[inline]
1111    fn square(self) -> Natural {
1112        match self {
1113            &Natural::ZERO | &Natural::ONE => self.clone(),
1114            Natural(Small(x)) => Natural({
1115                let (upper, lower) = Limb::x_mul_y_to_zz(*x, *x);
1116                if upper == 0 {
1117                    Small(lower)
1118                } else {
1119                    Large(vec![lower, upper])
1120                }
1121            }),
1122            Natural(Large(xs)) => Natural::from_owned_limbs_asc(limbs_square(xs)),
1123        }
1124    }
1125}
1126
1127impl SquareAssign for Natural {
1128    /// Squares a [`Natural`] in place.
1129    ///
1130    /// $$
1131    /// x \gets x^2.
1132    /// $$
1133    ///
1134    /// # Worst-case complexity
1135    /// $T(n) = O(n \log n \log\log n)$
1136    ///
1137    /// $M(n) = O(n \log n)$
1138    ///
1139    /// where $T$ is time, $M$ is additional memory, and $n$ is `self.significant_bits()`.
1140    ///
1141    /// # Examples
1142    /// ```
1143    /// use malachite_base::num::arithmetic::traits::SquareAssign;
1144    /// use malachite_base::num::basic::traits::Zero;
1145    /// use malachite_nz::natural::Natural;
1146    ///
1147    /// let mut x = Natural::ZERO;
1148    /// x.square_assign();
1149    /// assert_eq!(x, 0);
1150    ///
1151    /// let mut x = Natural::from(123u32);
1152    /// x.square_assign();
1153    /// assert_eq!(x, 15129);
1154    /// ```
1155    fn square_assign(&mut self) {
1156        match self {
1157            &mut (Self::ZERO | Self::ONE) => {}
1158            Self(Small(x)) => {
1159                let (upper, lower) = Limb::x_mul_y_to_zz(*x, *x);
1160                if upper == 0 {
1161                    *x = lower;
1162                } else {
1163                    *self = Self(Large(vec![lower, upper]));
1164                }
1165            }
1166            Self(Large(xs)) => {
1167                *xs = limbs_square(xs);
1168                self.trim();
1169            }
1170        }
1171    }
1172}