#include "nmod.h"
#include "nmod_vec.h"
#if (defined(__AVX2__) && FLINT_BITS == 64)
# include "machine_vectors.h"
#endif
/* Number of limbs (0..3) needed to hold the worst-case unreduced dot
 * product of length len, i.e. len * (mod.n - 1)^2. */
int _nmod_vec_dot_bound_limbs(slong len, nmod_t mod)
{
    const ulong maxval = mod.n - 1;

    /* small modulus: maxval^2 fits in a single limb */
    if (mod.n <= UWORD(1) << (FLINT_BITS / 2))
    {
        ulong hi, lo;
        umul_ppmm(hi, lo, maxval * maxval, len);
        if (hi != 0)
            return 2;
        return (lo != 0) ? 1 : 0;
    }

    /* general case: maxval^2 occupies two limbs, so the bound
     * len * maxval^2 is assembled limb by limb */
    ulong s2, s1, s0, c1, c0;
    umul_ppmm(s1, s0, maxval, maxval);
    umul_ppmm(s2, s1, s1, len);
    umul_ppmm(c1, c0, s0, len);
    add_sssaaaaaa(s2, s1, s0, s2, s1, UWORD(0), UWORD(0), c1, c0);
    if (s2 != 0)
        return 3;
    if (s1 != 0)
        return 2;
    return (s0 != 0) ? 1 : 0;
}
/* Translate an already-computed dot_params_t method into a limb count
 * (0..3); the _DOT_POW2 method defers to the direct computation from
 * len and mod. */
int _nmod_vec_dot_bound_limbs_from_params(slong len, nmod_t mod, dot_params_t params)
{
    if (params.method == _DOT_POW2)
        return _nmod_vec_dot_bound_limbs(len, mod);

    if (params.method == _DOT0)
        return 0;

    /* methods are ordered: up to _DOT1 fits one limb, up to _DOT2 two
     * limbs, anything beyond needs three */
    return (params.method <= _DOT1) ? 1
         : (params.method <= _DOT2) ? 2
         : 3;
}
/* Dot product of vec1 and vec2 modulo mod.n for the _DOT_POW2 method
 * (see _nmod_vec_dot_bound_limbs_from_params); delegates to the
 * single-limb accumulation macro. */
ulong _nmod_vec_dot_pow2(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod)
{
    ulong acc;
    slong k;
    _NMOD_VEC_DOT1(acc, k, len, vec1[k], vec2[k], mod)
    return acc;
}
/* Dot product sum_{k < len} vec1[k]*vec2[k] reduced modulo mod.n, for
 * the "dot1" regime: the unreduced sum is assumed to fit in a single
 * limb (presumably guaranteed by the caller via dot_params -- confirm
 * against callers). */
ulong _nmod_vec_dot1(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod)
#if defined(__AVX2__) && FLINT_BITS == 64
{
/* four 64-bit accumulator lanes; NOTE(review): vec4n_mul semantics
 * (full 64x64 low part vs 32x32 via vpmuludq) come from
 * machine_vectors.h -- confirm there */
vec4n dp = vec4n_zero();
slong i = 0;
/* main loop, unrolled: 32 elements (8 vector accumulations) per pass */
for ( ; i+31 < len; i += 32)
{
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+ 0), vec4n_load_unaligned(vec2+i+ 0)));
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+ 4), vec4n_load_unaligned(vec2+i+ 4)));
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+ 8), vec4n_load_unaligned(vec2+i+ 8)));
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+12), vec4n_load_unaligned(vec2+i+12)));
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+16), vec4n_load_unaligned(vec2+i+16)));
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+20), vec4n_load_unaligned(vec2+i+20)));
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+24), vec4n_load_unaligned(vec2+i+24)));
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+28), vec4n_load_unaligned(vec2+i+28)));
}
/* remaining groups of 4 */
for ( ; i + 3 < len; i += 4)
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i), vec4n_load_unaligned(vec2+i)));
/* collapse the lanes, then fold in the scalar tail (< 4 elements) */
ulong res = vec4n_horizontal_sum(dp);
for (; i < len; i++)
res += vec1[i] * vec2[i];
NMOD_RED(res, res, mod);
return res;
}
#else
{
/* scalar fallback: single-limb accumulation macro */
ulong res; slong i;
_NMOD_VEC_DOT1(res, i, len, vec1[i], vec2[i], mod)
return res;
}
#endif
#if FLINT_BITS == 64
/* Dot product of vec1[0..len) and vec2[0..len) reduced modulo mod.n,
 * using split accumulation: the running sum is kept as a low part
 * (bits below DOT_SPLIT_BITS) and a high part, recombined at the end
 * as hi * pow2_precomp + lo before a single NMOD_RED.  pow2_precomp is
 * presumably 2^DOT_SPLIT_BITS mod mod.n -- confirm with callers. */
ulong _nmod_vec_dot2_split(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod, ulong pow2_precomp)
#if defined(__AVX2__)
{
    const vec4n low_bits = vec4n_set_n(DOT_SPLIT_MASK);
    vec4n dp_lo = vec4n_zero();  /* 4 lanes of low-part accumulators */
    vec4n dp_hi = vec4n_zero();  /* 4 lanes of high-part accumulators */

    slong i = 0;

    /* main loop: 32 elements (8 vector accumulations) per pass; dp_lo is
     * compressed into dp_hi once per pass, bounding lane growth between
     * splits */
    for ( ; i+31 < len; i += 32)
    {
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+ 0), vec4n_load_unaligned(vec2+i+ 0)));
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+ 4), vec4n_load_unaligned(vec2+i+ 4)));
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+ 8), vec4n_load_unaligned(vec2+i+ 8)));
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+12), vec4n_load_unaligned(vec2+i+12)));
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+16), vec4n_load_unaligned(vec2+i+16)));
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+20), vec4n_load_unaligned(vec2+i+20)));
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+24), vec4n_load_unaligned(vec2+i+24)));
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+28), vec4n_load_unaligned(vec2+i+28)));
        dp_hi = vec4n_add(dp_hi, vec4n_bit_shift_right(dp_lo, DOT_SPLIT_BITS));
        dp_lo = vec4n_bit_and(dp_lo, low_bits);
    }

    /* remaining groups of 4, then one last split */
    for ( ; i + 3 < len; i += 4)
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i), vec4n_load_unaligned(vec2+i)));

    dp_hi = vec4n_add(dp_hi, vec4n_bit_shift_right(dp_lo, DOT_SPLIT_BITS));
    dp_lo = vec4n_bit_and(dp_lo, low_bits);

    /* collapse lanes, carry lo overflow into hi, fold in scalar tail */
    ulong hsum_lo = vec4n_horizontal_sum(dp_lo);
    const ulong hsum_hi = vec4n_horizontal_sum(dp_hi) + (hsum_lo >> DOT_SPLIT_BITS);
    hsum_lo &= DOT_SPLIT_MASK;

    for (; i < len; i++)
        hsum_lo += vec1[i] * vec2[i];

    ulong res;
    NMOD_RED(res, pow2_precomp * hsum_hi + hsum_lo, mod);
    return res;
}
#else
{
    /* scalar fallback */
    ulong res; slong i;
    _NMOD_VEC_DOT2_SPLIT(res, i, len, vec1[i], vec2[i], mod, pow2_precomp)
    return res;
}
#endif
/* FIX: this was "#endif #endif" on a single line; a directive must be
 * alone on its line, so the outer #if FLINT_BITS == 64 was left
 * unterminated. */
#endif
/* Dot product modulo mod.n where each product vec1[i]*vec2[i] is split
 * at bit DOT_SPLIT_BITS into high and low parts accumulated separately,
 * then recombined as hsum_hi * pow2_precomp + hsum_lo (pow2_precomp is
 * presumably 2^DOT_SPLIT_BITS mod mod.n -- confirm with callers). */
ulong _nmod_vec_dot2_half(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod, ulong pow2_precomp)
#if FLINT_BITS == 64 && defined(__AVX2__)
{
const vec4n low_bits = vec4n_set_n(DOT_SPLIT_MASK);
vec4n dp_lo = vec4n_zero();
vec4n dp_hi = vec4n_zero();
slong i = 0;
/* process blocks of 256 elements; every product is split immediately,
 * and dp_lo is compressed into dp_hi once per block */
for ( ; i+255 < len; i += 256)
{
ulong j = 0;
for ( ; j+3 < 256; j += 4)
{
__m256i prod = vec4n_mul(vec4n_load_unaligned(vec1+i+j), vec4n_load_unaligned(vec2+i+j));
dp_hi = vec4n_add(dp_hi, vec4n_bit_shift_right(prod, DOT_SPLIT_BITS));
dp_lo = vec4n_add(dp_lo, vec4n_bit_and(prod, low_bits));
}
dp_hi = vec4n_add(dp_hi, vec4n_bit_shift_right(dp_lo, DOT_SPLIT_BITS));
dp_lo = vec4n_bit_and(dp_lo, low_bits);
}
/* remaining groups of 4 */
for ( ; i+3 < len; i += 4)
{
__m256i prod = vec4n_mul(vec4n_load_unaligned(vec1+i), vec4n_load_unaligned(vec2+i));
dp_hi = vec4n_add(dp_hi, vec4n_bit_shift_right(prod, DOT_SPLIT_BITS));
dp_lo = vec4n_add(dp_lo, vec4n_bit_and(prod, low_bits));
}
/* collapse lanes; carry the overflow of hsum_lo into hsum_hi */
ulong hsum_lo = vec4n_horizontal_sum(dp_lo);
ulong hsum_hi = vec4n_horizontal_sum(dp_hi) + (hsum_lo >> DOT_SPLIT_BITS);
hsum_lo &= DOT_SPLIT_MASK;
/* scalar tail (< 4 elements), split the same way */
for ( ; i < len; i++)
{
ulong prod = vec1[i] * vec2[i];
hsum_hi += (prod >> DOT_SPLIT_BITS);
hsum_lo += (prod & DOT_SPLIT_MASK);
}
ulong res;
NMOD_RED(res, pow2_precomp * hsum_hi + hsum_lo, mod);
return res;
}
#else
{
/* scalar fallback; NOTE(review): the macro does not take pow2_precomp,
 * so that parameter is unused on this path -- presumably the macro
 * derives the split constant itself; confirm against nmod.h/nmod_vec.h */
ulong res; slong i;
_NMOD_VEC_DOT2_HALF(res, i, len, vec1[i], vec2[i], mod)
return res;
}
#endif
/* Dot product modulo mod.n, two-limb accumulation regime; delegates to
 * the _NMOD_VEC_DOT2 macro. */
ulong _nmod_vec_dot2(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod)
{
    ulong acc;
    slong k;
    _NMOD_VEC_DOT2(acc, k, len, vec1[k], vec2[k], mod)
    return acc;
}
/* Dot product modulo mod.n, three-limb accumulation regime; delegates
 * to the _NMOD_VEC_DOT3 macro. */
ulong _nmod_vec_dot3(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod)
{
    ulong acc;
    slong k;
    _NMOD_VEC_DOT3(acc, k, len, vec1[k], vec2[k], mod)
    return acc;
}
/* Dot product modulo mod.n, three-limb "accumulate" variant; delegates
 * to the _NMOD_VEC_DOT3_ACC macro. */
ulong _nmod_vec_dot3_acc(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod)
{
    ulong acc;
    slong k;
    _NMOD_VEC_DOT3_ACC(acc, k, len, vec1[k], vec2[k], mod)
    return acc;
}
/* Reverse-order variant: sum of vec1[k]*vec2[len-1-k] modulo mod.n,
 * _DOT_POW2 method, single-limb accumulation macro. */
ulong _nmod_vec_dot_pow2_rev(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod)
{
    ulong acc;
    slong k;
    _NMOD_VEC_DOT1(acc, k, len, vec1[k], vec2[len-1-k], mod)
    return acc;
}
/* Reverse-order dot product sum_{k < len} vec1[k]*vec2[len-1-k] modulo
 * mod.n, single-limb ("dot1") accumulation regime. */
ulong _nmod_vec_dot1_rev(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod)
#if defined(__AVX2__) && FLINT_BITS == 64
{
vec4n dp = vec4n_zero();
slong i = 0;
/* main loop, 32 elements per pass: vec2 is loaded from the end and
 * each 4-lane load is lane-reversed (permute 3,2,1,0) to line up with
 * the forward loads of vec1 */
for ( ; i+31 < len; i += 32)
{
const ulong ii = len - 32 - i; dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+ 0), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+ii+28))));
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+ 4), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+ii+24))));
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+ 8), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+ii+20))));
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+12), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+ii+16))));
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+16), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+ii+12))));
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+20), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+ii+ 8))));
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+24), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+ii+ 4))));
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+28), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+ii+ 0))));
}
/* remaining groups of 4, same reversal */
for ( ; i + 3 < len; i += 4)
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+len-4-i))));
/* collapse lanes, then the scalar tail (< 4 elements) */
ulong res = vec4n_horizontal_sum(dp);
for (; i < len; i++)
res += vec1[i] * vec2[len-1-i];
NMOD_RED(res, res, mod);
return res;
}
#else
{
/* scalar fallback */
ulong res; slong i;
_NMOD_VEC_DOT1(res, i, len, vec1[i], vec2[len-1-i], mod)
return res;
}
#endif
#if FLINT_BITS == 64
/* Reverse-order variant of _nmod_vec_dot2_split: computes
 * sum_{k < len} vec1[k]*vec2[len-1-k] mod mod.n with split accumulation
 * (low part below bit DOT_SPLIT_BITS, high part recombined at the end
 * via pow2_precomp, presumably 2^DOT_SPLIT_BITS mod mod.n). */
ulong _nmod_vec_dot2_split_rev(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod, ulong pow2_precomp)
#if defined(__AVX2__)
{
    const vec4n low_bits = vec4n_set_n(DOT_SPLIT_MASK);
    vec4n dp_lo = vec4n_zero();
    vec4n dp_hi = vec4n_zero();

    slong i = 0;

    /* main loop, 32 elements per pass: vec2 is loaded from the end and
     * each 4-lane load is lane-reversed to line up with vec1; dp_lo is
     * compressed into dp_hi once per pass */
    for ( ; i+31 < len; i += 32)
    {
        const ulong ii = len - 32 - i;
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+ 0), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+ii+28))));
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+ 4), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+ii+24))));
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+ 8), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+ii+20))));
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+12), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+ii+16))));
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+16), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+ii+12))));
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+20), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+ii+ 8))));
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+24), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+ii+ 4))));
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+28), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+ii+ 0))));
        dp_hi = vec4n_add(dp_hi, vec4n_bit_shift_right(dp_lo, DOT_SPLIT_BITS));
        dp_lo = vec4n_bit_and(dp_lo, low_bits);
    }

    /* remaining groups of 4, then one last split */
    for ( ; i + 3 < len; i += 4)
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i), vec4n_permute_3_2_1_0(vec4n_load_unaligned(vec2+len-4-i))));

    dp_hi = vec4n_add(dp_hi, vec4n_bit_shift_right(dp_lo, DOT_SPLIT_BITS));
    dp_lo = vec4n_bit_and(dp_lo, low_bits);

    /* collapse lanes, carry lo overflow into hi, fold in scalar tail */
    ulong hsum_lo = vec4n_horizontal_sum(dp_lo);
    const ulong hsum_hi = vec4n_horizontal_sum(dp_hi) + (hsum_lo >> DOT_SPLIT_BITS);
    hsum_lo &= DOT_SPLIT_MASK;

    for (; i < len; i++)
        hsum_lo += vec1[i] * vec2[len-1-i];

    ulong res;
    NMOD_RED(res, pow2_precomp * hsum_hi + hsum_lo, mod);
    return res;
}
#else
{
    /* scalar fallback */
    ulong res; slong i;
    _NMOD_VEC_DOT2_SPLIT(res, i, len, vec1[i], vec2[len-1-i], mod, pow2_precomp)
    return res;
}
#endif
/* FIX: this was "#endif #endif" on a single line; a directive must be
 * alone on its line, so the outer #if FLINT_BITS == 64 was left
 * unterminated. */
#endif
/* Reverse-order "two half limbs" dot product: sum of vec1[k]*vec2[len-1-k]
 * modulo mod.n via the _NMOD_VEC_DOT2_HALF macro. */
ulong _nmod_vec_dot2_half_rev(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod)
{
    ulong acc;
    slong k;
    _NMOD_VEC_DOT2_HALF(acc, k, len, vec1[k], vec2[len-1-k], mod)
    return acc;
}
/* Reverse-order two-limb dot product: sum of vec1[k]*vec2[len-1-k]
 * modulo mod.n via the _NMOD_VEC_DOT2 macro. */
ulong _nmod_vec_dot2_rev(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod)
{
    ulong acc;
    slong k;
    _NMOD_VEC_DOT2(acc, k, len, vec1[k], vec2[len-1-k], mod)
    return acc;
}
/* Reverse-order three-limb "accumulate" dot product via the
 * _NMOD_VEC_DOT3_ACC macro. */
ulong _nmod_vec_dot3_acc_rev(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod)
{
    ulong acc;
    slong k;
    _NMOD_VEC_DOT3_ACC(acc, k, len, vec1[k], vec2[len-1-k], mod)
    return acc;
}
/* Reverse-order three-limb dot product via the _NMOD_VEC_DOT3 macro. */
ulong _nmod_vec_dot3_rev(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod)
{
    ulong acc;
    slong k;
    _NMOD_VEC_DOT3(acc, k, len, vec1[k], vec2[len-1-k], mod)
    return acc;
}
/* Column dot product for the _DOT_POW2 method: vec2 is an array of row
 * pointers and entry k is read as vec2[k][offset]; single-limb
 * accumulation macro. */
ulong _nmod_vec_dot_pow2_ptr(nn_srcptr vec1, const nn_ptr * vec2, slong offset, slong len, nmod_t mod)
{
    ulong acc;
    slong k;
    _NMOD_VEC_DOT1(acc, k, len, vec1[k], vec2[k][offset], mod)
    return acc;
}
/* Column dot product, single-limb ("dot1") regime: vec2 is an array of
 * row pointers and entry k is read as vec2[k][offset], i.e. the dot
 * product of vec1 with a matrix column. */
ulong _nmod_vec_dot1_ptr(nn_srcptr vec1, const nn_ptr * vec2, slong offset, slong len, nmod_t mod)
#if defined(__AVX2__) && FLINT_BITS == 64
{
vec4n dp = vec4n_zero();
slong i = 0;
/* main loop, 32 elements per pass: column entries are gathered four at
 * a time with vec4n_set_n4 (scalar loads; no hardware 64-bit gather) */
for ( ; i+31 < len; i += 32)
{
vec4n vec2_4n;
vec2_4n = vec4n_set_n4(vec2[i+ 0][offset], vec2[i+ 1][offset], vec2[i+ 2][offset], vec2[i+ 3][offset]);
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+ 0), vec2_4n));
vec2_4n = vec4n_set_n4(vec2[i+ 4][offset], vec2[i+ 5][offset], vec2[i+ 6][offset], vec2[i+ 7][offset]);
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+ 4), vec2_4n));
vec2_4n = vec4n_set_n4(vec2[i+ 8][offset], vec2[i+ 9][offset], vec2[i+10][offset], vec2[i+11][offset]);
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+ 8), vec2_4n));
vec2_4n = vec4n_set_n4(vec2[i+12][offset], vec2[i+13][offset], vec2[i+14][offset], vec2[i+15][offset]);
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+12), vec2_4n));
vec2_4n = vec4n_set_n4(vec2[i+16][offset], vec2[i+17][offset], vec2[i+18][offset], vec2[i+19][offset]);
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+16), vec2_4n));
vec2_4n = vec4n_set_n4(vec2[i+20][offset], vec2[i+21][offset], vec2[i+22][offset], vec2[i+23][offset]);
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+20), vec2_4n));
vec2_4n = vec4n_set_n4(vec2[i+24][offset], vec2[i+25][offset], vec2[i+26][offset], vec2[i+27][offset]);
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+24), vec2_4n));
vec2_4n = vec4n_set_n4(vec2[i+28][offset], vec2[i+29][offset], vec2[i+30][offset], vec2[i+31][offset]);
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i+28), vec2_4n));
}
/* remaining groups of 4 */
for ( ; i + 3 < len; i += 4)
{
vec4n vec2_4n = vec4n_set_n4(vec2[i+0][offset], vec2[i+1][offset], vec2[i+2][offset], vec2[i+3][offset]);
dp = vec4n_add(dp, vec4n_mul(vec4n_load_unaligned(vec1+i), vec2_4n));
}
/* collapse lanes, then the scalar tail (< 4 elements) */
ulong res = vec4n_horizontal_sum(dp);
for (; i < len; i++)
res += vec1[i] * vec2[i][offset];
NMOD_RED(res, res, mod);
return res;
}
#else
{
/* scalar fallback */
ulong res; slong i;
_NMOD_VEC_DOT1(res, i, len, vec1[i], vec2[i][offset], mod)
return res;
}
#endif
#if FLINT_BITS == 64
/* Column variant of _nmod_vec_dot2_split: vec2 is an array of row
 * pointers and entry k is read as vec2[k][offset].  Split accumulation
 * as in _nmod_vec_dot2_split (low part below bit DOT_SPLIT_BITS, high
 * part recombined via pow2_precomp, presumably 2^DOT_SPLIT_BITS mod
 * mod.n). */
ulong _nmod_vec_dot2_split_ptr(nn_srcptr vec1, const nn_ptr * vec2, slong offset, slong len, nmod_t mod, ulong pow2_precomp)
#if defined(__AVX2__)
{
    const vec4n low_bits = vec4n_set_n(DOT_SPLIT_MASK);
    vec4n dp_lo = vec4n_zero();
    vec4n dp_hi = vec4n_zero();

    slong i = 0;

    /* main loop, 32 elements per pass: column entries gathered four at a
     * time with vec4n_set_n4; dp_lo is compressed into dp_hi once per
     * pass */
    for ( ; i+31 < len; i += 32)
    {
        vec4n vec2_4n;
        vec2_4n = vec4n_set_n4(vec2[i+ 0][offset], vec2[i+ 1][offset], vec2[i+ 2][offset], vec2[i+ 3][offset]);
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+ 0), vec2_4n));
        vec2_4n = vec4n_set_n4(vec2[i+ 4][offset], vec2[i+ 5][offset], vec2[i+ 6][offset], vec2[i+ 7][offset]);
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+ 4), vec2_4n));
        vec2_4n = vec4n_set_n4(vec2[i+ 8][offset], vec2[i+ 9][offset], vec2[i+10][offset], vec2[i+11][offset]);
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+ 8), vec2_4n));
        vec2_4n = vec4n_set_n4(vec2[i+12][offset], vec2[i+13][offset], vec2[i+14][offset], vec2[i+15][offset]);
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+12), vec2_4n));
        vec2_4n = vec4n_set_n4(vec2[i+16][offset], vec2[i+17][offset], vec2[i+18][offset], vec2[i+19][offset]);
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+16), vec2_4n));
        vec2_4n = vec4n_set_n4(vec2[i+20][offset], vec2[i+21][offset], vec2[i+22][offset], vec2[i+23][offset]);
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+20), vec2_4n));
        vec2_4n = vec4n_set_n4(vec2[i+24][offset], vec2[i+25][offset], vec2[i+26][offset], vec2[i+27][offset]);
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+24), vec2_4n));
        vec2_4n = vec4n_set_n4(vec2[i+28][offset], vec2[i+29][offset], vec2[i+30][offset], vec2[i+31][offset]);
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i+28), vec2_4n));
        dp_hi = vec4n_add(dp_hi, vec4n_bit_shift_right(dp_lo, DOT_SPLIT_BITS));
        dp_lo = vec4n_bit_and(dp_lo, low_bits);
    }

    /* remaining groups of 4, then one last split */
    for ( ; i + 3 < len; i += 4)
    {
        vec4n vec2_4n = vec4n_set_n4(vec2[i+0][offset], vec2[i+1][offset], vec2[i+2][offset], vec2[i+3][offset]);
        dp_lo = vec4n_add(dp_lo, vec4n_mul(vec4n_load_unaligned(vec1+i), vec2_4n));
    }

    dp_hi = vec4n_add(dp_hi, vec4n_bit_shift_right(dp_lo, DOT_SPLIT_BITS));
    dp_lo = vec4n_bit_and(dp_lo, low_bits);

    /* collapse lanes, carry lo overflow into hi, fold in scalar tail */
    ulong hsum_lo = vec4n_horizontal_sum(dp_lo);
    const ulong hsum_hi = vec4n_horizontal_sum(dp_hi) + (hsum_lo >> DOT_SPLIT_BITS);
    hsum_lo &= DOT_SPLIT_MASK;

    for (; i < len; i++)
        hsum_lo += vec1[i] * vec2[i][offset];

    ulong res;
    NMOD_RED(res, pow2_precomp * hsum_hi + hsum_lo, mod);
    return res;
}
#else
{
    /* scalar fallback */
    ulong res; slong i;
    _NMOD_VEC_DOT2_SPLIT(res, i, len, vec1[i], vec2[i][offset], mod, pow2_precomp)
    return res;
}
#endif
/* FIX: this was "#endif #endif" on a single line; a directive must be
 * alone on its line, so the outer #if FLINT_BITS == 64 was left
 * unterminated. */
#endif
/* Column "two half limbs" dot product: entry k of the second operand is
 * vec2[k][offset]; delegates to the _NMOD_VEC_DOT2_HALF macro. */
ulong _nmod_vec_dot2_half_ptr(nn_srcptr vec1, const nn_ptr * vec2, slong offset, slong len, nmod_t mod)
{
    ulong acc;
    slong k;
    _NMOD_VEC_DOT2_HALF(acc, k, len, vec1[k], vec2[k][offset], mod)
    return acc;
}
/* Column two-limb dot product: entry k of the second operand is
 * vec2[k][offset]; delegates to the _NMOD_VEC_DOT2 macro. */
ulong _nmod_vec_dot2_ptr(nn_srcptr vec1, const nn_ptr * vec2, slong offset, slong len, nmod_t mod)
{
    ulong acc;
    slong k;
    _NMOD_VEC_DOT2(acc, k, len, vec1[k], vec2[k][offset], mod)
    return acc;
}
/* Column three-limb "accumulate" dot product: entry k of the second
 * operand is vec2[k][offset]; delegates to _NMOD_VEC_DOT3_ACC. */
ulong _nmod_vec_dot3_acc_ptr(nn_srcptr vec1, const nn_ptr * vec2, slong offset, slong len, nmod_t mod)
{
    ulong acc;
    slong k;
    _NMOD_VEC_DOT3_ACC(acc, k, len, vec1[k], vec2[k][offset], mod)
    return acc;
}
/* Column three-limb dot product: entry k of the second operand is
 * vec2[k][offset]; delegates to the _NMOD_VEC_DOT3 macro. */
ulong _nmod_vec_dot3_ptr(nn_srcptr vec1, const nn_ptr * vec2, slong offset, slong len, nmod_t mod)
{
    ulong acc;
    slong k;
    _NMOD_VEC_DOT3(acc, k, len, vec1[k], vec2[k][offset], mod)
    return acc;
}