#include "longlong.h"
#include "radix.h"
ulong
radix_divrem_two(nn_ptr res, nn_srcptr a, slong an, const radix_t radix)
{
slong i;
ulong q, r, hi, lo, B = LIMB_RADIX(radix);
q = a[an - 1] >> 1;
r = a[an - 1] & 1;
res[an - 1] = q;
for (i = an - 2; i >= 0; i--)
{
umul_ppmm(hi, lo, r, B);
add_ssaaaa(hi, lo, hi, lo, 0, a[i]);
q = (hi << (FLINT_BITS - 1)) | (lo >> 1);
r = lo & 1;
res[i] = q;
}
return r;
}
ulong
radix_divrem_1(nn_ptr res, nn_srcptr a, slong an, ulong d, const radix_t radix)
{
slong i;
ulong q, r, hi, lo, B = LIMB_RADIX(radix);
q = a[an - 1] / d;
r = a[an - 1] % d;
res[an - 1] = q;
for (i = an - 2; i >= 0; i--)
{
umul_ppmm(hi, lo, r, B);
add_ssaaaa(hi, lo, hi, lo, 0, a[i]);
udiv_qrnnd(q, r, hi, lo, d);
res[i] = q;
}
return r;
}
void
radix_divexact_1(nn_ptr res, nn_srcptr a, slong an, ulong d, const radix_t radix)
{
ulong r;
r = radix_divrem_1(res, a, an, d, radix);
FLINT_ASSERT(r == 0);
(void) r;
}
void
radix_divrem_via_mpn(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, const radix_t radix)
{
nn_ptr bq, br, ba, bb, tmp;
slong bqn, brn, ban, bbn, alloc;
TMP_INIT;
FLINT_ASSERT(an >= bn);
FLINT_ASSERT(bn >= 1);
FLINT_ASSERT(b[bn - 1] != 0);
if (flint_mpn_zero_p(a + bn, an - bn) && mpn_cmp(a, b, bn) < 0)
{
if (r != NULL)
flint_mpn_copyi(r, a, bn);
if (q != NULL)
flint_mpn_zero(q, an - bn + 1);
return;
}
alloc = ((an - bn + 2) + (bn + 1) + (an + 1) + (bn + 1));
TMP_START;
tmp = TMP_ALLOC(alloc * sizeof(ulong));
bq = tmp;
br = bq + (an - bn + 2);
ba = br + (bn + 1);
bb = ba + (an + 1);
ban = radix_get_mpn(ba, a, an, radix);
bbn = radix_get_mpn(bb, b, bn, radix);
bqn = ban - bbn + 1;
brn = bbn;
FLINT_ASSERT(ban >= bbn);
mpn_tdiv_qr(bq, br, 0, ba, ban, bb, bbn);
slong qn, rn, qn2, rn2;
nn_ptr tmp2, q2, r2;
if (q != NULL)
{
qn2 = radix_set_mpn_need_alloc(bqn, radix);
tmp2 = TMP_ALLOC(qn2 * sizeof(ulong));
q2 = tmp2;
qn = radix_set_mpn(q2, bq, bqn, radix);
if (r != NULL)
{
radix_mulmid(br, b, bn, q2, qn, 0, bn, radix);
radix_sub(r, a, bn, br, bn, radix);
}
flint_mpn_copyi(q, q2, qn);
flint_mpn_zero(q + qn, (an - bn + 1) - qn);
}
else
{
rn2 = radix_set_mpn_need_alloc(brn, radix);
tmp2 = TMP_ALLOC(rn2 * sizeof(ulong));
r2 = tmp2;
rn = radix_set_mpn(r2, br, brn, radix);
flint_mpn_copyi(r, r2, rn);
flint_mpn_zero(r + rn, bn - rn);
}
TMP_END;
}
void
radix_inv_approx_basecase(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix)
{
nn_ptr U;
nn_srcptr V;
slong Un, Vn;
slong guard_limbs;
TMP_INIT;
guard_limbs = 1;
Vn = FLINT_MIN(An, n + guard_limbs);
Un = Vn + n + 1;
TMP_START;
U = TMP_ALLOC(Un * sizeof(ulong));
V = A + An - Vn;
flint_mpn_zero(U, Un - 1);
U[Un - 1] = 1;
if (Vn == 1)
radix_divrem_1(Q, U, Un, V[0], radix);
else
radix_divrem_via_mpn(Q, NULL, U, Un, V, Vn, radix);
TMP_END;
}
#define RADIX_INV_NEWTON_CUTOFF 8
void
radix_inv_approx(nn_ptr Q, nn_srcptr A, slong An, slong n, const radix_t radix)
{
if (n <= RADIX_INV_NEWTON_CUTOFF)
{
radix_inv_approx_basecase(Q, A, An, n, radix);
}
else
{
nn_ptr U, V, T, Vhigh, Uhigh;
nn_srcptr Ahigh;
slong m, Uhighn, Tn, Ahighn, Vhighn;
int Unegative;
TMP_INIT;
m = (n + 1) / 2 + 1 + (LIMB_RADIX(radix) <= 3);
radix_inv_approx(Q + n - m, A, An, m, radix);
TMP_START;
T = Q + n - m;
Tn = m + 1 + (T[m + 1] != 0);
flint_mpn_zero(Q, n - m);
slong control_limb = n / 2 + 2;
slong A_low_zeroes = 0;
if (An >= n + 1)
{
Ahighn = n + 1;
Ahigh = A + An - Ahighn;
}
else
{
Ahighn = An;
Ahigh = A;
A_low_zeroes = n + 1 - An;
}
slong low_trunc_without_mulhigh = m + 1;
slong low_trunc_with_mulhigh = m - 1;
if (LIMB_RADIX(radix) > 2 * (m + 2) && A_low_zeroes <= low_trunc_with_mulhigh)
{
U = TMP_ALLOC((2 + (control_limb + 1)) * sizeof(ulong));
radix_mulmid(U, Ahigh, Ahighn, T, Tn,
low_trunc_with_mulhigh - A_low_zeroes,
low_trunc_with_mulhigh - A_low_zeroes + 2 + (control_limb + 1),
radix);
Uhigh = U + 2;
}
else if (A_low_zeroes <= low_trunc_without_mulhigh)
{
U = TMP_ALLOC((low_trunc_without_mulhigh - A_low_zeroes + (control_limb) + 1) * sizeof(ulong));
radix_mulmid(U, Ahigh, Ahighn, T, Tn, 0,
low_trunc_without_mulhigh - A_low_zeroes + (control_limb) + 1,
radix);
Uhigh = U + low_trunc_without_mulhigh - A_low_zeroes;
}
else
{
U = TMP_ALLOC((control_limb + 1) * sizeof(ulong));
radix_mulmid(U + A_low_zeroes - low_trunc_without_mulhigh,
Ahigh, Ahighn, T, Tn, 0,
control_limb + 1 - (A_low_zeroes - low_trunc_without_mulhigh),
radix);
flint_mpn_zero(U, A_low_zeroes - low_trunc_without_mulhigh);
Uhigh = U;
}
Unegative = (Uhigh[control_limb] != 0);
if (Unegative)
radix_neg(Uhigh, Uhigh, control_limb, radix);
Uhighn = control_limb;
while (Uhighn > 0 && Uhigh[Uhighn - 1] == 0)
Uhighn--;
if (Uhighn != 0)
{
if (LIMB_RADIX(radix) > 2 * (m + 2))
{
V = TMP_ALLOC((Tn + Uhighn - (m - 2)) * sizeof(ulong));
radix_mulmid(V, T, Tn, Uhigh, Uhighn, m - 2, Tn + Uhighn, radix);
Vhigh = V + 2;
Vhighn = Tn + Uhighn - (m - 2) - 2;
}
else
{
V = TMP_ALLOC((Tn + Uhighn) * sizeof(ulong));
radix_mul(V, T, Tn, Uhigh, Uhighn, radix);
Vhigh = V + m;
Vhighn = Tn + Uhighn - m;
}
if (Unegative)
radix_add(Q, Q, n + 2, Vhigh, Vhighn, radix);
else
radix_sub(Q, Q, n + 2, Vhigh, Vhighn, radix);
}
TMP_END;
}
}
void
radix_div_approx_invmul(nn_ptr Q, nn_srcptr B, slong Bn, nn_srcptr A, slong An, slong n, const radix_t radix)
{
nn_ptr T;
TMP_INIT;
TMP_START;
if (Bn > n)
{
B = B + Bn - n;
Bn = n;
}
T = TMP_ALLOC((Bn + n + 2) * sizeof(ulong));
radix_inv_approx(Q, A, An, n, radix);
radix_mul(T, Q, n + 2, B, Bn, radix);
flint_mpn_copyi(Q, T + Bn, n + 2);
TMP_END;
}
void
radix_div_approx(nn_ptr Q, nn_srcptr B, slong Bn, nn_srcptr A, slong An, slong n, const radix_t radix)
{
nn_ptr U, V, T, Vhigh, Uhigh;
nn_srcptr Ahigh;
slong m, Uhighn, Tn, Ahighn, Vhighn;
int Unegative;
TMP_INIT;
if (n <= 4)
{
radix_div_approx_invmul(Q, B, Bn, A, An, n, radix);
return;
}
m = (n + 1) / 2 + 1 + (LIMB_RADIX(radix) <= 3);
radix_inv_approx(Q + n - m, A, An, m, radix);
TMP_START;
T = Q + n - m;
Tn = m + 1 + (T[m + 1] != 0);
flint_mpn_zero(Q, n - m);
nn_ptr TB, TBhigh;
slong Bn2 = FLINT_MIN(m, Bn);
nn_srcptr B2 = B + Bn - Bn2;
if (LIMB_RADIX(radix) > 2 * (m + 2) && Bn2 > 2)
{
TB = TMP_ALLOC((Tn + 2) * sizeof(ulong));
radix_mulmid(TB, T, Tn, B2, Bn2, Bn2 - 2, Bn2 + Tn, radix);
TBhigh = TB + 2;
}
else
{
TB = TMP_ALLOC((2 * m + 2) * sizeof(ulong));
radix_mul(TB, T, Tn, B2, Bn2, radix);
TBhigh = TB + Bn2;
}
slong control_limb = n / 2 + 2;
slong A_low_zeroes = 0;
if (An >= n + 1)
{
Ahighn = n + 1;
Ahigh = A + An - Ahighn;
}
else
{
Ahighn = An;
Ahigh = A;
A_low_zeroes = n + 1 - An;
}
slong low_trunc_without_mulhigh = m + 1;
slong low_trunc_with_mulhigh = m - 1;
if (LIMB_RADIX(radix) > 2 * (m + 2) && A_low_zeroes <= low_trunc_with_mulhigh)
{
U = TMP_ALLOC((2 + (control_limb + 1)) * sizeof(ulong));
radix_mulmid(U, Ahigh, Ahighn, TBhigh, Tn,
low_trunc_with_mulhigh - A_low_zeroes,
low_trunc_with_mulhigh - A_low_zeroes + 2 + (control_limb + 1),
radix);
Uhigh = U + 2;
}
else if (A_low_zeroes <= low_trunc_without_mulhigh)
{
U = TMP_ALLOC((low_trunc_without_mulhigh - A_low_zeroes + (control_limb) + 1) * sizeof(ulong));
radix_mulmid(U, Ahigh, Ahighn, TBhigh, Tn, 0,
low_trunc_without_mulhigh - A_low_zeroes + (control_limb) + 1,
radix);
Uhigh = U + low_trunc_without_mulhigh - A_low_zeroes;
}
else
{
U = TMP_ALLOC((control_limb + 1) * sizeof(ulong));
radix_mulmid(U + A_low_zeroes - low_trunc_without_mulhigh,
Ahigh, Ahighn, TBhigh, Tn, 0,
control_limb + 1 - (A_low_zeroes - low_trunc_without_mulhigh),
radix);
flint_mpn_zero(U, A_low_zeroes - low_trunc_without_mulhigh);
Uhigh = U;
}
slong Un = control_limb + 1;
if (Bn > n - Un)
{
if (Bn >= n)
{
nn_srcptr Bhigh = B + Bn - n;
radix_sub(Uhigh, Bhigh, Un, Uhigh, Un, radix);
}
else
{
slong nz = n - Bn;
ulong cy = radix_neg(Uhigh, Uhigh, nz, radix);
radix_sub(Uhigh + nz, B, Un - nz, Uhigh + nz, Un - nz, radix);
radix_sub(Uhigh + nz, Uhigh + nz, Un - nz, &cy, 1, radix);
}
Unegative = 1;
if (radix_cmp_bn_half(Uhigh, Un, radix) > 0)
{
radix_neg(Uhigh, Uhigh, Un, radix);
Unegative = 0;
}
}
else
{
Unegative = 0;
if (radix_cmp_bn_half(Uhigh, Un, radix) > 0)
{
radix_neg(Uhigh, Uhigh, Un, radix);
Unegative = 1;
}
}
Uhighn = Un;
while (Uhighn > 0 && Uhigh[Uhighn - 1] == 0)
Uhighn--;
if (Uhighn != 0)
{
if (LIMB_RADIX(radix) > 2 * (m + 2))
{
V = TMP_ALLOC((Tn + Uhighn - (m - 2)) * sizeof(ulong));
radix_mulmid(V, T, Tn, Uhigh, Uhighn, m - 2, Tn + Uhighn, radix);
Vhigh = V + 2;
Vhighn = Tn + Uhighn - (m - 2) - 2;
}
else
{
V = TMP_ALLOC((Tn + Uhighn) * sizeof(ulong));
radix_mul(V, T, Tn, Uhigh, Uhighn, radix);
Vhigh = V + m;
Vhighn = Tn + Uhighn - m;
}
Q[n + 1] = 0;
flint_mpn_copyi(Q + n - m, TBhigh, Tn);
if (Unegative)
radix_add(Q, Q, n + 2, Vhigh, Vhighn, radix);
else
radix_sub(Q, Q, n + 2, Vhigh, Vhighn, radix);
}
else
{
Q[n + 1] = 0;
flint_mpn_copyi(Q + n - m, TBhigh, Tn);
}
TMP_END;
}
#define USE_PREINV2 1
#define PREINV2_EXTRA_LIMBS 2
void
radix_divrem_preinv(nn_ptr Q, nn_ptr R, nn_srcptr A, slong An, nn_srcptr B, slong Bn, nn_srcptr Binv, slong Binvn, const radix_t radix)
{
nn_ptr T, U, q, r;
slong n;
ulong one = 1;
TMP_INIT;
TMP_START;
n = An - Bn + 1;
if (Binvn < n)
flint_throw(FLINT_ERROR, "radix_divrem_preinv: inverse has too few limbs");
slong n2 = n + PREINV2_EXTRA_LIMBS;
#if USE_PREINV2
if (Binvn >= n2 && An >= n2 && LIMB_RADIX(radix) > n2 + 6)
{
U = TMP_ALLOC((n2 + 2) * sizeof(ulong));
radix_mulmid(U, A + An - n2, n2, Binv + Binvn - n2, n2 + 2, n2, 2 * n2 + 2, radix);
FLINT_ASSERT(U[n2 + 1] == 0);
q = U + n2 + 2 - (n + 1);
if (q[-1] > 1 && q[-1] < LIMB_RADIX(radix) - 1)
{
if (R == NULL)
r = NULL;
else
{
T = TMP_ALLOC(Bn * sizeof(ulong));
r = T;
radix_mulmid(r, q, FLINT_MIN(Bn, n + 1), B, Bn, 0, Bn, radix);
radix_sub(r, A, Bn, r, Bn, radix);
}
goto done;
}
}
else
#endif
{
if (LIMB_RADIX(radix) > n + 2)
{
U = TMP_ALLOC((n + 3) * sizeof(ulong));
radix_mulmid(U, A + Bn - 1, n, Binv + Binvn - n, n + 2, An - Bn, 2 * n + 2, radix);
q = U + 2;
}
else
{
U = TMP_ALLOC((2 * n + 2) * sizeof(ulong));
radix_mul(U, A + Bn - 1, n, Binv + Binvn - n, n + 2, radix);
q = U + An - Bn + 2;
}
}
T = TMP_ALLOC((Bn + n) * sizeof(ulong));
r = T;
if (q[n] != 0)
radix_sub(q, q, n + 1, &one, 1, radix);
radix_mul(r, q, n, B, Bn, radix);
while (r[An] != 0 || mpn_cmp(r, A, An) > 0)
{
radix_sub(r, r, An + 1, B, Bn, radix);
radix_sub(q, q, n, &one, 1, radix);
}
radix_sub(r, A, An, r, An, radix);
while (r[Bn] != 0 || mpn_cmp(r, B, Bn) >= 0)
{
radix_add(q, q, n, &one, 1, radix);
radix_sub(r, r, Bn + 1, B, Bn, radix);
}
done:
flint_mpn_copyi(Q, q, An - Bn + 1);
if (R != NULL)
flint_mpn_copyi(R, r, Bn);
TMP_END;
}
void
radix_divrem_newton(nn_ptr Q, nn_ptr R, nn_srcptr A, slong An, nn_srcptr B, slong Bn, const radix_t radix)
{
nn_ptr T;
slong n;
TMP_INIT;
if (flint_mpn_zero_p(A + Bn, An - Bn) && mpn_cmp(A, B, Bn) < 0)
{
if (R != NULL)
flint_mpn_copyi(R, A, Bn);
flint_mpn_zero(Q, An - Bn + 1);
return;
}
TMP_START;
n = An - Bn + 1;
T = TMP_ALLOC((n + PREINV2_EXTRA_LIMBS + 2) * sizeof(ulong));
#if USE_PREINV2
radix_inv_approx(T, B, Bn, n + PREINV2_EXTRA_LIMBS, radix);
radix_divrem_preinv(Q, R, A, An, B, Bn, T, n + PREINV2_EXTRA_LIMBS, radix);
#else
radix_inv_approx(T, B, Bn, n, radix);
radix_divrem_preinv(Q, R, A, An, B, Bn, T, n, radix);
#endif
TMP_END;
return;
}
void
radix_divrem_newton_karp_markstein(nn_ptr Q, nn_ptr R, nn_srcptr A, slong An, nn_srcptr B, slong Bn, const radix_t radix)
{
if (flint_mpn_zero_p(A + Bn, An - Bn) && mpn_cmp(A, B, Bn) < 0)
{
if (R != NULL)
flint_mpn_copyi(R, A, Bn);
flint_mpn_zero(Q, An - Bn + 1);
return;
}
nn_ptr T, U, q, r;
slong n;
ulong one = 1;
TMP_INIT;
TMP_START;
n = An - Bn + 1;
slong n2 = n + PREINV2_EXTRA_LIMBS;
U = TMP_ALLOC((n2 + 2) * sizeof(ulong));
radix_div_approx(U, A, An, B, Bn, n2, radix);
FLINT_ASSERT(U[n2 + 1] == 0);
q = U + n2 + 2 - (n + 1);
if (q[-1] > 1 && q[-1] < LIMB_RADIX(radix) - 1)
{
if (R == NULL)
r = NULL;
else
{
T = TMP_ALLOC(Bn * sizeof(ulong));
r = T;
radix_mulmid(r, q, FLINT_MIN(Bn, n + 1), B, Bn, 0, Bn, radix);
radix_sub(r, A, Bn, r, Bn, radix);
}
goto done;
}
T = TMP_ALLOC((Bn + n) * sizeof(ulong));
r = T;
if (q[n] != 0)
radix_sub(q, q, n + 1, &one, 1, radix);
radix_mul(r, q, n, B, Bn, radix);
while (r[An] != 0 || mpn_cmp(r, A, An) > 0)
{
radix_sub(r, r, An + 1, B, Bn, radix);
radix_sub(q, q, n, &one, 1, radix);
}
radix_sub(r, A, An, r, An, radix);
while (r[Bn] != 0 || mpn_cmp(r, B, Bn) >= 0)
{
radix_add(q, q, n, &one, 1, radix);
radix_sub(r, r, Bn + 1, B, Bn, radix);
}
done:
flint_mpn_copyi(Q, q, An - Bn + 1);
if (R != NULL)
flint_mpn_copyi(R, r, Bn);
TMP_END;
}
void
radix_divrem(nn_ptr q, nn_ptr r, nn_srcptr a, slong an, nn_srcptr b, slong bn, const radix_t radix)
{
FLINT_ASSERT(an >= bn);
FLINT_ASSERT(bn >= 1);
FLINT_ASSERT(b[bn - 1] != 0);
if (bn == 1)
{
ulong r0 = radix_divrem_1(q, a, an, b[0], radix);
if (r != NULL)
r[0] = r0;
return;
}
if ((bn >= 20 && (an - bn + 1) <= 150) || bn >= 70)
{
radix_divrem_newton_karp_markstein(q, r, a, an, b, bn, radix);
return;
}
if (an > 2 * bn)
{
nn_ptr T, R;
slong i;
slong antop;
ulong cy;
TMP_INIT;
FLINT_ASSERT(an > 2 * bn);
TMP_START;
R = TMP_ALLOC(3 * bn * sizeof(ulong));
T = R + bn;
i = (an + bn - 1) / bn - 2;
antop = an - i * bn;
FLINT_ASSERT(antop >= bn);
radix_divrem(q + i * bn, R, a + i * bn, antop, b, bn, radix);
cy = q[i * bn];
for (i--; i >= 0; i--)
{
flint_mpn_copyi(T, a + i * bn, bn);
flint_mpn_copyi(T + bn, R, bn);
radix_divrem(q + i * bn, R, T, 2 * bn, b, bn, radix);
q[(i + 1) * bn] = cy;
cy = q[i * bn];
}
if (r != NULL)
flint_mpn_copyi(r, R, bn);
TMP_END;
return;
}
radix_divrem_via_mpn(q, r, a, an, b, bn, radix);
}
static int
_radix_div_via_mpn(nn_ptr q, nn_srcptr a, slong an, nn_srcptr b, slong bn, const radix_t radix)
{
nn_ptr bq, br, ba, bb, tmp, q2;
slong bqn, ban, brn, bbn, alloc, qn, qn2;
int exact;
TMP_INIT;
FLINT_ASSERT(an >= bn);
FLINT_ASSERT(bn >= 1);
FLINT_ASSERT(b[bn - 1] != 0);
alloc = ((an - bn + 2) + (bn + 1) + (an + 1) + (bn + 1));
TMP_START;
tmp = TMP_ALLOC(alloc * sizeof(ulong));
bq = tmp;
br = bq + (an - bn + 2);
ba = br + (bn + 1);
bb = ba + (an + 1);
ban = radix_get_mpn(ba, a, an, radix);
bbn = radix_get_mpn(bb, b, bn, radix);
bqn = ban - bbn + 1;
brn = bbn;
FLINT_ASSERT(ban >= bbn);
mpn_tdiv_qr(bq, br, 0, ba, ban, bb, bbn);
exact = flint_mpn_zero_p(br, brn);
if (exact)
{
qn2 = radix_set_mpn_need_alloc(bqn, radix);
q2 = TMP_ALLOC(qn2 * sizeof(ulong));
qn = radix_set_mpn(q2, bq, bqn, radix);
flint_mpn_copyi(q, q2, qn);
flint_mpn_zero(q + qn, (an - bn + 1) - qn);
}
TMP_END;
return exact;
}
int
radix_div(nn_ptr q, nn_srcptr a, slong an, nn_srcptr b, slong bn, const radix_t radix)
{
FLINT_ASSERT(an >= bn);
FLINT_ASSERT(bn >= 1);
FLINT_ASSERT(b[bn - 1] != 0);
if (bn == 1)
return radix_divrem_1(q, a, an, b[0], radix) == 0;
if (flint_mpn_zero_p(a + bn, an - bn) && mpn_cmp(a, b, bn) < 0)
{
if (flint_mpn_zero_p(a, bn))
{
flint_mpn_zero(q, an - bn + 1);
return 1;
}
else
{
return 0;
}
}
if (bn >= 48 || an > 2 * bn)
{
int exact;
nn_ptr r;
TMP_INIT;
TMP_START;
r = TMP_ALLOC(bn * sizeof(ulong));
radix_divrem(q, r, a, an, b, bn, radix);
exact = flint_mpn_zero_p(r, bn);
TMP_END;
return exact;
}
return _radix_div_via_mpn(q, a, an, b, bn, radix);
}
void
radix_divexact(nn_ptr q, nn_srcptr a, slong an, nn_srcptr b, slong bn, const radix_t radix)
{
FLINT_ASSERT(an >= bn);
FLINT_ASSERT(bn >= 1);
FLINT_ASSERT(b[bn - 1] != 0);
if (bn == 1)
{
radix_divexact_1(q, a, an, b[0], radix);
return;
}
radix_div(q, a, an, b, bn, radix);
}