#ifndef NMOD_VEC_H
#define NMOD_VEC_H
#ifdef NMOD_VEC_INLINES_C
#define NMOD_VEC_INLINE
#else
#define NMOD_VEC_INLINE static inline
#endif
#include "flint.h"
#include "nmod.h"
#ifdef __cplusplus
extern "C" {
#endif
/*
    Normalise a length: while i is nonzero and the entry vec[i - 1] equals
    zero, decrement i. On exit either i == 0 or vec[i - 1] != 0.

    vec may be any pointer-valued expression (it is parenthesised inside the
    macro, so e.g. NMOD_VEC_NORM(p + 1, n) works); i must be a modifiable
    lvalue and is evaluated several times.
*/
#define NMOD_VEC_NORM(vec, i)                     \
do {                                              \
    while ((i) && (vec)[(i) - 1] == UWORD(0))     \
        (i)--;                                    \
} while (0)
/*
    Allocate storage for len words via flint_malloc and return it.
    The memory is not initialised here; release it with _nmod_vec_clear.
*/
NMOD_VEC_INLINE
nn_ptr _nmod_vec_init(slong len)
{
    nn_ptr fresh = (nn_ptr) flint_malloc(len * sizeof(ulong));

    return fresh;
}
/*
    Release a vector by handing the pointer to flint_free.
*/
NMOD_VEC_INLINE
void _nmod_vec_clear(nn_ptr vec)
{
    flint_free(vec);

    return;
}
/*
    Random vector generation (declarations only; the implementations are not
    part of this header). Both take the output vector, a flint_rand_t state,
    the length and the modulus. Presumably they fill vec with len residues
    modulo mod.n -- confirm the exact distributions in the library docs.
*/
void _nmod_vec_randtest(nn_ptr vec, flint_rand_t state, slong len, nmod_t mod);
void _nmod_vec_rand(nn_ptr vec, flint_rand_t state, slong len, nmod_t mod);
/*
    Store 0 in each of the first len entries of vec, in increasing index
    order. Nothing is written when len <= 0.
*/
NMOD_VEC_INLINE
void _nmod_vec_zero(nn_ptr vec, slong len)
{
    slong pos = 0;

    while (pos < len)
    {
        vec[pos] = 0;
        pos++;
    }
}
/*
    Declaration only (defined outside this header). Takes a read-only vector
    and its length and returns a bit count; presumably the largest bit length
    among the entries -- confirm against the implementation.
*/
flint_bitcnt_t _nmod_vec_max_bits(nn_srcptr vec, slong len);
/*
    Copy the first len entries of vec into res, one word at a time in
    increasing index order. Nothing is copied when len <= 0.
*/
NMOD_VEC_INLINE
void _nmod_vec_set(nn_ptr res, nn_srcptr vec, slong len)
{
    slong pos = 0;

    while (pos < len)
    {
        res[pos] = vec[pos];
        pos++;
    }
}
/*
    Exchange the first length entries of a and b entry by entry, in
    increasing index order.
*/
NMOD_VEC_INLINE
void _nmod_vec_swap(nn_ptr a, nn_ptr b, slong length)
{
    slong pos = 0;

    while (pos < length)
    {
        /* same access order as a classic three-step swap: read a, write a, write b */
        const ulong held = a[pos];

        a[pos] = b[pos];
        b[pos] = held;
        pos++;
    }
}
/*
    Return 1 if the first len entries of vec and vec2 agree, 0 otherwise.
    Scanning stops at the first mismatch; vectors of length <= 0 compare
    equal.
*/
NMOD_VEC_INLINE
int _nmod_vec_equal(nn_srcptr vec, nn_srcptr vec2, slong len)
{
    slong pos = 0;

    /* advance past the common prefix */
    while (pos < len && vec[pos] == vec2[pos])
        pos++;

    /* equal exactly when the scan ran off the end */
    return (pos < len) ? 0 : 1;
}
/*
    Return 1 if all of the first len entries of vec are 0, and 0 as soon as a
    nonzero entry is found. A vector of length <= 0 counts as zero.
*/
NMOD_VEC_INLINE
int _nmod_vec_is_zero(nn_srcptr vec, slong len)
{
    slong pos = 0;

    /* skip leading zero entries */
    while (pos < len && vec[pos] == 0)
        pos++;

    return (pos < len) ? 0 : 1;
}
/*
    Input/output. The FILE-based variants are only declared when
    FLINT_HAVE_FILE is defined. Implementations live outside this header;
    the meaning of the int return values is not visible here.
*/
#ifdef FLINT_HAVE_FILE
int _nmod_vec_fprint_pretty(FILE * file, nn_srcptr vec, slong len, nmod_t mod);
int _nmod_vec_fprint(FILE * f, nn_srcptr vec, slong len, nmod_t mod);
#endif
void _nmod_vec_print_pretty(nn_srcptr vec, slong len, nmod_t mod);
int _nmod_vec_print(nn_srcptr vec, slong len, nmod_t mod);
/*
    Arithmetic on vectors of length len with modulus mod (declarations only).
    res is the output vector; for the scalar functions c is the scalar.
    The _redc/_generic/_shoup suffixes presumably name alternative
    implementations of the same operation -- confirm in the library docs.
*/
void _nmod_vec_reduce(nn_ptr res, nn_srcptr vec, slong len, nmod_t mod);
void _nmod_vec_add(nn_ptr res, nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod);
void _nmod_vec_sub(nn_ptr res, nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod);
void _nmod_vec_neg(nn_ptr res, nn_srcptr vec, slong len, nmod_t mod);
void _nmod_vec_scalar_mul_nmod(nn_ptr res, nn_srcptr vec, slong len, ulong c, nmod_t mod);
void _nmod_vec_scalar_mul_nmod_redc(nn_ptr res, nn_srcptr vec, slong len, ulong c, nmod_t mod);
void _nmod_vec_scalar_mul_nmod_generic(nn_ptr res, nn_srcptr vec, slong len, ulong c, nmod_t mod);
void _nmod_vec_scalar_mul_nmod_shoup(nn_ptr res, nn_srcptr vec, slong len, ulong c, nmod_t mod);
void _nmod_vec_scalar_addmul_nmod(nn_ptr res, nn_srcptr vec, slong len, ulong c, nmod_t mod);
void _nmod_vec_scalar_addmul_nmod_generic(nn_ptr res, nn_srcptr vec, slong len, ulong c, nmod_t mod);
void _nmod_vec_scalar_addmul_nmod_shoup(nn_ptr res, nn_srcptr vec, slong len, ulong c, nmod_t mod);
/*
    AVX2 variant of the unreduced scalar addmul; only declared on 64-bit
    builds compiled with __AVX2__. Note that it takes no modulus argument.
*/
#if FLINT_BITS == 64 && defined(__AVX2__)
void _nmod_vec_nored_scalar_addmul_halflimb_avx2(nn_ptr res, nn_srcptr vec, slong len, ulong c);
#endif
/*
    res[k] += vec[k] * c for 0 <= k < len, using plain word arithmetic with
    no modular reduction (wrap-around is the caller's concern).

    On 64-bit builds with __AVX2__, vectors of length >= 16 are passed to
    _nmod_vec_nored_scalar_addmul_halflimb_avx2 instead of the scalar loop.
*/
NMOD_VEC_INLINE void
_nmod_vec_nored_scalar_addmul_halflimb(nn_ptr res, nn_srcptr vec, slong len, ulong c)
{
#if FLINT_BITS == 64 && defined(__AVX2__)
    if (len >= 16)
        _nmod_vec_nored_scalar_addmul_halflimb_avx2(res, vec, len, c);
    else
#endif
    {
        /* scalar path: the only path when AVX2 is unavailable */
        slong pos;

        for (pos = 0; pos < len; pos++)
            res[pos] += c * vec[pos];
    }
}
/*
    Unreduced ("nored") scalar addmul declarations: none of them takes a
    modulus. The _ll / _lll prefixes presumably indicate two- and three-limb
    accumulators per entry -- confirm the layout of res in the
    implementation.
    NOTE(review): the first prototype redeclares a function that this header
    also defines inline just above; check whether the redeclaration is still
    wanted.
*/
void _nmod_vec_nored_scalar_addmul_halflimb(nn_ptr res, nn_srcptr vec, slong len, ulong c);
void _nmod_vec_nored_ll_scalar_addmul_halflimb(nn_ptr res, nn_srcptr vec, slong len, ulong c);
void _nmod_vec_nored_ll_scalar_addmul(nn_ptr res, nn_srcptr vec, slong len, ulong c);
void _nmod_vec_nored_lll_scalar_addmul(nn_ptr res, nn_srcptr vec, slong len, ulong c);
/*
    Vector inversion declarations (implementations not in this header).
    Unlike most functions in this file, len is an ulong here, not an slong.
*/
void _nmod_vec_invert(nn_ptr res, nn_srcptr vec, ulong len, nmod_t mod);
void _nmod_vec_invert_naive(nn_ptr res, nn_srcptr vec, ulong len, nmod_t mod);
void _nmod_vec_invert_generic(nn_ptr res, nn_srcptr vec, ulong len, nmod_t mod);
void _nmod_vec_invert_shoup(nn_ptr res, nn_srcptr vec, ulong len, nmod_t mod);
/*
    Parameters of the "split" dot product, 64-bit only.
    DOT_SPLIT_MASK is 2^56 - 1, i.e. the low DOT_SPLIT_BITS bits set; the
    two constants must be kept in sync.
*/
#if (FLINT_BITS == 64)
# define DOT_SPLIT_BITS 56
# define DOT_SPLIT_MASK UWORD(72057594037927935)
#endif
/*
    Helper for _nmod_vec_dot_params; relies on `len` and `mod` being in scope
    at the expansion site. When len equals fixedlen it returns from the
    enclosing function with method
        _DOT1 if mod.n <= onelimb_bnd,
        _DOT2 if onelimb_bnd < mod.n <= twolimb_bnd,
        _DOT3 otherwise,
    and pow2_precomp set to 0. For any other len it does nothing.
    The bounds are passed as bare integer literals (they are wrapped in
    UWORD() here).
*/
#define _FIXED_LEN_MOD_BOUNDS(fixedlen, onelimb_bnd, twolimb_bnd)     \
    if (len == fixedlen)                                              \
    {                                                                 \
        dot_params_t params = {_DOT3, UWORD(0)};                      \
        if (mod.n <= UWORD(onelimb_bnd))                              \
            params.method = _DOT1;                                    \
        else if (mod.n <= UWORD(twolimb_bnd))                         \
            params.method = _DOT2;                                    \
        return params;                                                \
    }
/*
    Choose the dot product algorithm for vectors of length len modulo mod.n.

    Returns a dot_params_t holding the selected method and, for the methods
    that need it, the precomputed value 2^DOT_SPLIT_BITS reduced modulo
    mod.n (0 otherwise).

    Selection order:
      1. len == 0 or mod.n == 1          -> _DOT0
      2. mod.n a power of two            -> _DOT_POW2
      3. len <= 11                       -> _DOT1 / _DOT2 / _DOT3 from the
                                            per-length modulus bounds below
      4. mod.n <= 2^(FLINT_BITS/2)       -> _DOT1, _DOT2_SPLIT (64-bit only)
                                            or _DOT2_HALF
      5. otherwise                       -> _DOT2, _DOT3_ACC or _DOT3

    Every preprocessor directive below must stay on a line of its own:
    trailing text after #endif is silently discarded by the preprocessor,
    and a '#' in the middle of a line is not a directive at all.
*/
FLINT_FORCE_INLINE dot_params_t _nmod_vec_dot_params(ulong len, nmod_t mod)
{
    /* trivial cases: the result is always zero */
    if (len == 0 || mod.n == 1)
    {
        dot_params_t params = {_DOT0, UWORD(0)};
        return params;
    }

    /* mod.n has a single bit set, i.e. it is a power of two */
    if ((mod.n & (mod.n - 1)) == 0)
    {
        dot_params_t params = {_DOT_POW2, UWORD(0)};
        return params;
    }

    /* short vectors: table of modulus bounds per fixed length 2..11 */
    if (len <= 11)
    {
#if FLINT_BITS == 64
        _FIXED_LEN_MOD_BOUNDS(11, 1294981365, 5561902608746059656);
        _FIXED_LEN_MOD_BOUNDS(10, 1358187914, 5833372668713515885);
        _FIXED_LEN_MOD_BOUNDS( 9, 1431655766, 6148914691236517206);
        _FIXED_LEN_MOD_BOUNDS( 8, 1518500250, 6521908912666391107);
        _FIXED_LEN_MOD_BOUNDS( 7, 1623345051, 6972213902555716131);
        _FIXED_LEN_MOD_BOUNDS( 6, 1753413057, 7530851732716320753);
        _FIXED_LEN_MOD_BOUNDS( 5, 1920767767, 8249634742471189718);
        _FIXED_LEN_MOD_BOUNDS( 4, 2147483648, 9223372036854775808);
        _FIXED_LEN_MOD_BOUNDS( 3, 2479700525, 10650232656628343402);
        _FIXED_LEN_MOD_BOUNDS( 2, 3037000500, 13043817825332782213);
#else
        _FIXED_LEN_MOD_BOUNDS(11, 19760, 1294981365);
        _FIXED_LEN_MOD_BOUNDS(10, 20725, 1358187914);
        _FIXED_LEN_MOD_BOUNDS( 9, 21846, 1431655766);
        _FIXED_LEN_MOD_BOUNDS( 8, 23171, 1518500250);
        _FIXED_LEN_MOD_BOUNDS( 7, 24771, 1623345051);
        _FIXED_LEN_MOD_BOUNDS( 6, 26755, 1753413057);
        _FIXED_LEN_MOD_BOUNDS( 5, 29309, 1920767767);
        _FIXED_LEN_MOD_BOUNDS( 4, 32768, 2147483648);
        _FIXED_LEN_MOD_BOUNDS( 3, 37838, 2479700525);
        _FIXED_LEN_MOD_BOUNDS( 2, 46341, 3037000500);
#endif

        /* only len == 1 gets here: a single product needs one limb when
           mod.n <= 2^(FLINT_BITS/2), two limbs otherwise */
        if (mod.n <= (UWORD(1) << FLINT_BITS / 2))
        {
            dot_params_t params = {_DOT1, UWORD(0)};
            return params;
        }

        dot_params_t params = {_DOT2, UWORD(0)};
        return params;
    }

    /* half-limb modulus: each product (n-1)^2 fits in a single limb */
    if (mod.n <= UWORD(1) << (FLINT_BITS / 2))
    {
        const ulong t0 = (mod.n - 1) * (mod.n - 1);
        ulong u1, u0;

        /* (u1, u0) = len * (n-1)^2 as a two-limb value; a zero high limb
           means the whole sum fits in one limb */
        umul_ppmm(u1, u0, t0, len);
        if (u1 == 0)
        {
            dot_params_t params = {_DOT1, UWORD(0)};
            return params;
        }

#if (FLINT_BITS == 64)
        /* split method: only within these modulus and length bounds */
        if (mod.n <= UWORD(1515531528) && len <= WORD(380368697))
        {
            ulong pow2_precomp;
            NMOD_RED(pow2_precomp, (UWORD(1) << DOT_SPLIT_BITS), mod);
            dot_params_t params = {_DOT2_SPLIT, pow2_precomp};
            return params;
        }
#endif

#if (FLINT_BITS == 64) && defined(__AVX2__)
        /* the AVX2 build of the half-limb method also receives the
           precomputed 2^DOT_SPLIT_BITS mod n */
        ulong pow2_precomp;
        NMOD_RED(pow2_precomp, (UWORD(1) << DOT_SPLIT_BITS), mod);
        dot_params_t params = {_DOT2_HALF, pow2_precomp};
        return params;
#else
        dot_params_t params = {_DOT2_HALF, UWORD(0)};
        return params;
#endif
    }

    /* general modulus: form len * (n-1)^2 over three limbs (t2, t1, low)
       and check whether the top limb is zero */
    ulong t2, t1, t0, u1, u0;
    umul_ppmm(t1, t0, mod.n - 1, mod.n - 1);
    umul_ppmm(t2, t1, t1, len);
    umul_ppmm(u1, u0, t0, len);
    add_ssaaaa(t2, t1, t2, t1, UWORD(0), u1);

    if (t2 == 0)
    {
        dot_params_t params = {_DOT2, UWORD(0)};
        return params;
    }

    /* three limbs are needed; below this bound (the same value as the
       two-limb bound of the length-8 table row) the accumulating variant
       _DOT3_ACC is selected */
#if (FLINT_BITS == 64)
    if (mod.n <= UWORD(6521908912666391107))
#else
    if (mod.n <= UWORD(1518500250))
#endif
    {
        dot_params_t params = {_DOT3_ACC, UWORD(0)};
        return params;
    }

    dot_params_t params = {_DOT3, UWORD(0)};
    return params;
}
#undef _FIXED_LEN_MOD_BOUNDS
/*
    Declarations only; implementations are outside this header. Presumably
    these return the number of limbs needed for an unreduced dot product of
    the given length -- confirm against the implementation.
*/
int _nmod_vec_dot_bound_limbs(slong len, nmod_t mod);
int _nmod_vec_dot_bound_limbs_from_params(slong len, nmod_t mod, dot_params_t params);
/*
    Per-method dot product kernels, dispatched to by _nmod_vec_dot.
    _nmod_vec_dot2_half and _nmod_vec_dot2_split take an extra pow2_precomp
    argument; the split kernel is only declared on 64-bit builds.
*/
ulong _nmod_vec_dot_pow2(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod);
ulong _nmod_vec_dot1(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod);
ulong _nmod_vec_dot2_half(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod, ulong pow2_precomp);
ulong _nmod_vec_dot2(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod);
ulong _nmod_vec_dot3_acc(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod);
ulong _nmod_vec_dot3(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod);
#if FLINT_BITS == 64
ulong _nmod_vec_dot2_split(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod, ulong pow2_precomp);
#endif
/*
    Kernels dispatched to by _nmod_vec_dot_rev (second vector read in
    reverse order, judging by the vec2[len-1-i] access in that function).
    Note that _nmod_vec_dot2_half_rev takes no pow2_precomp argument.
*/
ulong _nmod_vec_dot_pow2_rev(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod);
ulong _nmod_vec_dot1_rev(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod);
ulong _nmod_vec_dot2_half_rev(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod);
ulong _nmod_vec_dot2_rev(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod);
ulong _nmod_vec_dot3_acc_rev(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod);
ulong _nmod_vec_dot3_rev(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod);
#if FLINT_BITS == 64
ulong _nmod_vec_dot2_split_rev(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod, ulong pow2_precomp);
#endif
/*
    Kernels dispatched to by _nmod_vec_dot_ptr, where vec2 is an array of
    pointers and the i-th term uses vec2[i][offset] (see that function).
    Note that _nmod_vec_dot2_half_ptr takes no pow2_precomp argument.
*/
ulong _nmod_vec_dot_pow2_ptr(nn_srcptr vec1, const nn_ptr * vec2, slong offset, slong len, nmod_t mod);
ulong _nmod_vec_dot1_ptr(nn_srcptr vec1, const nn_ptr * vec2, slong offset, slong len, nmod_t mod);
ulong _nmod_vec_dot2_half_ptr(nn_srcptr vec1, const nn_ptr * vec2, slong offset, slong len, nmod_t mod);
ulong _nmod_vec_dot2_ptr(nn_srcptr vec1, const nn_ptr * vec2, slong offset, slong len, nmod_t mod);
ulong _nmod_vec_dot3_acc_ptr(nn_srcptr vec1, const nn_ptr * vec2, slong offset, slong len, nmod_t mod);
ulong _nmod_vec_dot3_ptr(nn_srcptr vec1, const nn_ptr * vec2, slong offset, slong len, nmod_t mod);
#if FLINT_BITS == 64
ulong _nmod_vec_dot2_split_ptr(nn_srcptr vec1, const nn_ptr * vec2, slong offset, slong len, nmod_t mod, ulong pow2_precomp);
#endif
/*
    Fixed-length dot product with a single-limb accumulator.

    Expands to a block that sums fixedlen products (expr1) * (expr2) in one
    word, incrementing the caller's index variable `i` after every term,
    reduces the sum with NMOD_RED and RETURNS it from the enclosing function.
    Requires `i` and `mod` in scope at the expansion site; expr1 and expr2
    are re-evaluated for every term.

    The last line of the macro must not end in a backslash: a continuation
    there would splice the following line into this macro's body.
*/
#define _NMOD_VEC_DOT_SHORT1(fixedlen,expr1,expr2)                \
    {                                                             \
        ulong res = (expr1) * (expr2); i++;                       \
        for (slong j = 0; j < (fixedlen) - 1; j++, i++)           \
            res += (expr1) * (expr2);                             \
        NMOD_RED(res, res, mod);                                  \
        return res;                                               \
    }
/*
    Fixed-length dot product with a two-limb accumulator.

    Expands to a block that forms each product with umul_ppmm, accumulates
    the two-limb results in (u1, u0) with add_ssaaaa, incrementing the
    caller's index variable `i` after every term, reduces with NMOD2_RED2
    and RETURNS the result from the enclosing function.
    Requires `i` and `mod` in scope at the expansion site.

    The last line of the macro must not end in a backslash: a continuation
    there would splice the following line into this macro's body.
*/
#define _NMOD_VEC_DOT_SHORT2(fixedlen,expr1,expr2)                \
    {                                                             \
        ulong s0, s1, u0, u1;                                     \
        umul_ppmm(u1, u0, (expr1), (expr2)); i++;                 \
        for (slong j = 0; j < (fixedlen) - 1; j++, i++)           \
        {                                                         \
            umul_ppmm(s1, s0, (expr1), (expr2));                  \
            add_ssaaaa(u1, u0, u1, u0, s1, s0);                   \
        }                                                         \
        NMOD2_RED2(s0, u1, u0, mod);                              \
        return s0;                                                \
    }
/*
    Fixed-length dot product with a three-limb accumulator.

    Expands to a block that forms each product with umul_ppmm, accumulates
    into (t2, t1, t0) with add_sssaaaaaa, incrementing the caller's index
    variable `i` after every term, reduces the top limb with NMOD_RED, then
    the whole value with NMOD_RED3, and RETURNS the result from the
    enclosing function.
    Requires `i` and `mod` in scope at the expansion site.

    The last line of the macro must not end in a backslash: a continuation
    there would splice the following line into this macro's body.
*/
#define _NMOD_VEC_DOT_SHORT3(fixedlen,expr1,expr2)                \
    {                                                             \
        ulong t2 = UWORD(0);                                      \
        ulong t1, t0;                                             \
        umul_ppmm(t1, t0, (expr1), (expr2)); i++;                 \
        for (slong j = 0; j < (fixedlen) - 1; j++, i++)           \
        {                                                         \
            ulong s0, s1;                                         \
            umul_ppmm(s1, s0, (expr1), (expr2));                  \
            add_sssaaaaaa(t2, t1, t0,                             \
                          t2, t1, t0,                             \
                          UWORD(0), s1, s0);                      \
        }                                                         \
                                                                  \
        NMOD_RED(t2, t2, mod);                                    \
        ulong res;                                                \
        NMOD_RED3(res, t2, t1, t0, mod);                          \
        return res;                                               \
    }
/*
    Dispatch for short dot products, 1 <= len <= 11.

    For method _DOT1 / _DOT_POW2, _DOT2 or _DOT3 this expands the matching
    fixed-length macro (_NMOD_VEC_DOT_SHORT1/2/3), each of which RETURNS the
    result from the enclosing function; the last entry of each chain is
    unconditional and handles len == 11. For any other method nothing
    happens and control continues after the macro.

    i is the caller's index variable (must start at 0) that expr1 / expr2
    are written in terms of. Use as a statement followed by a semicolon.
    The _NMOD_VEC_DOT_SHORT1/2/3 macros must still be defined at the point
    of expansion.

    The macro is a proper do { } while (0) and its last line carries no
    line continuation, so the code following it is not absorbed into the
    macro body.
*/
#define _NMOD_VEC_DOT_SHORT(i, expr1, expr2, len, mod, method)        \
    do                                                                \
    {                                                                 \
        if ((method) == _DOT1 || (method) == _DOT_POW2)               \
        {                                                             \
            if ((len) == 1) _NMOD_VEC_DOT_SHORT1( 1, expr1, expr2)    \
            if ((len) == 2) _NMOD_VEC_DOT_SHORT1( 2, expr1, expr2)    \
            if ((len) == 3) _NMOD_VEC_DOT_SHORT1( 3, expr1, expr2)    \
            if ((len) == 4) _NMOD_VEC_DOT_SHORT1( 4, expr1, expr2)    \
            if ((len) == 5) _NMOD_VEC_DOT_SHORT1( 5, expr1, expr2)    \
            if ((len) == 6) _NMOD_VEC_DOT_SHORT1( 6, expr1, expr2)    \
            if ((len) == 7) _NMOD_VEC_DOT_SHORT1( 7, expr1, expr2)    \
            if ((len) == 8) _NMOD_VEC_DOT_SHORT1( 8, expr1, expr2)    \
            if ((len) == 9) _NMOD_VEC_DOT_SHORT1( 9, expr1, expr2)    \
            if ((len) == 10) _NMOD_VEC_DOT_SHORT1(10, expr1, expr2)   \
            _NMOD_VEC_DOT_SHORT1(11, expr1, expr2)                    \
        }                                                             \
                                                                      \
        else if ((method) == _DOT2)                                   \
        {                                                             \
            if ((len) == 1) return nmod_mul((expr1), (expr2), mod);   \
            if ((len) == 2) _NMOD_VEC_DOT_SHORT2( 2, expr1, expr2)    \
            if ((len) == 3) _NMOD_VEC_DOT_SHORT2( 3, expr1, expr2)    \
            if ((len) == 4) _NMOD_VEC_DOT_SHORT2( 4, expr1, expr2)    \
            if ((len) == 5) _NMOD_VEC_DOT_SHORT2( 5, expr1, expr2)    \
            if ((len) == 6) _NMOD_VEC_DOT_SHORT2( 6, expr1, expr2)    \
            if ((len) == 7) _NMOD_VEC_DOT_SHORT2( 7, expr1, expr2)    \
            if ((len) == 8) _NMOD_VEC_DOT_SHORT2( 8, expr1, expr2)    \
            if ((len) == 9) _NMOD_VEC_DOT_SHORT2( 9, expr1, expr2)    \
            if ((len) == 10) _NMOD_VEC_DOT_SHORT2(10, expr1, expr2)   \
            _NMOD_VEC_DOT_SHORT2(11, expr1, expr2)                    \
        }                                                             \
                                                                      \
        else if ((method) == _DOT3)                                   \
        {                                                             \
            if ((len) == 1) return nmod_mul((expr1), (expr2), mod);   \
            if ((len) == 2) _NMOD_VEC_DOT_SHORT3( 2, expr1, expr2)    \
            if ((len) == 3) _NMOD_VEC_DOT_SHORT3( 3, expr1, expr2)    \
            if ((len) == 4) _NMOD_VEC_DOT_SHORT3( 4, expr1, expr2)    \
            if ((len) == 5) _NMOD_VEC_DOT_SHORT3( 5, expr1, expr2)    \
            if ((len) == 6) _NMOD_VEC_DOT_SHORT3( 6, expr1, expr2)    \
            if ((len) == 7) _NMOD_VEC_DOT_SHORT3( 7, expr1, expr2)    \
            if ((len) == 8) _NMOD_VEC_DOT_SHORT3( 8, expr1, expr2)    \
            if ((len) == 9) _NMOD_VEC_DOT_SHORT3( 9, expr1, expr2)    \
            if ((len) == 10) _NMOD_VEC_DOT_SHORT3(10, expr1, expr2)   \
            _NMOD_VEC_DOT_SHORT3(11, expr1, expr2)                    \
        }                                                             \
    } while (0)
/*
    Dot product of vec1 and vec2 (length len) modulo mod.n, using the method
    recorded in params.

    Lengths up to 11 are first offered to the unrolled short code, which
    returns directly for methods _DOT1, _DOT_POW2, _DOT2 and _DOT3; every
    other combination falls through to the per-method kernels below.
    Returns 0 for len == 0 and for a method none of the cases recognise.
*/
FLINT_FORCE_INLINE ulong _nmod_vec_dot(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod, dot_params_t params)
{
    if (len <= 11)
    {
        if (len == 0)
            return UWORD(0);

        slong i = 0;
        _NMOD_VEC_DOT_SHORT(i, vec1[i], vec2[i], len, mod, params.method);
    }

    switch (params.method)
    {
        case _DOT1:
            return _nmod_vec_dot1(vec1, vec2, len, mod);

#if FLINT_BITS == 64
        case _DOT2_SPLIT:
            return _nmod_vec_dot2_split(vec1, vec2, len, mod, params.pow2_precomp);
#endif

        case _DOT2:
            return _nmod_vec_dot2(vec1, vec2, len, mod);

        case _DOT3_ACC:
            return _nmod_vec_dot3_acc(vec1, vec2, len, mod);

        case _DOT3:
            return _nmod_vec_dot3(vec1, vec2, len, mod);

        case _DOT2_HALF:
            return _nmod_vec_dot2_half(vec1, vec2, len, mod, params.pow2_precomp);

        case _DOT_POW2:
            /* small power-of-two moduli use the single-limb kernel */
            return (mod.n <= UWORD(1) << (FLINT_BITS / 2))
                 ? _nmod_vec_dot1(vec1, vec2, len, mod)
                 : _nmod_vec_dot_pow2(vec1, vec2, len, mod);

        default:
            /* e.g. _DOT0 */
            return UWORD(0);
    }
}
/*
    Reversed dot product: term i pairs vec1[i] with vec2[len-1-i], modulo
    mod.n, using the method recorded in params.

    Lengths up to 11 are first offered to the unrolled short code, which
    returns directly for methods _DOT1, _DOT_POW2, _DOT2 and _DOT3; every
    other combination falls through to the per-method _rev kernels below.
    Returns 0 for len == 0 and for a method none of the cases recognise.
*/
FLINT_FORCE_INLINE ulong _nmod_vec_dot_rev(nn_srcptr vec1, nn_srcptr vec2, slong len, nmod_t mod, dot_params_t params)
{
    if (len <= 11)
    {
        if (len == 0)
            return UWORD(0);

        slong i = 0;
        _NMOD_VEC_DOT_SHORT(i, vec1[i], vec2[len-1-i], len, mod, params.method);
    }

    switch (params.method)
    {
        case _DOT1:
            return _nmod_vec_dot1_rev(vec1, vec2, len, mod);

#if FLINT_BITS == 64
        case _DOT2_SPLIT:
            return _nmod_vec_dot2_split_rev(vec1, vec2, len, mod, params.pow2_precomp);
#endif

        case _DOT2:
            return _nmod_vec_dot2_rev(vec1, vec2, len, mod);

        case _DOT3_ACC:
            return _nmod_vec_dot3_acc_rev(vec1, vec2, len, mod);

        case _DOT3:
            return _nmod_vec_dot3_rev(vec1, vec2, len, mod);

        case _DOT2_HALF:
            return _nmod_vec_dot2_half_rev(vec1, vec2, len, mod);

        case _DOT_POW2:
            /* small power-of-two moduli use the single-limb kernel */
            return (mod.n <= UWORD(1) << (FLINT_BITS / 2))
                 ? _nmod_vec_dot1_rev(vec1, vec2, len, mod)
                 : _nmod_vec_dot_pow2_rev(vec1, vec2, len, mod);

        default:
            /* e.g. _DOT0 */
            return UWORD(0);
    }
}
/*
    Dot product where the second operand is reached through an array of
    pointers: term i pairs vec1[i] with vec2[i][offset], modulo mod.n, using
    the method recorded in params.

    Lengths up to 11 are first offered to the unrolled short code, which
    returns directly for methods _DOT1, _DOT_POW2, _DOT2 and _DOT3; every
    other combination falls through to the per-method _ptr kernels below.
    Returns 0 for len == 0 and for a method none of the cases recognise.
*/
FLINT_FORCE_INLINE ulong _nmod_vec_dot_ptr(nn_srcptr vec1, const nn_ptr * vec2, slong offset, slong len, nmod_t mod, dot_params_t params)
{
    if (len <= 11)
    {
        if (len == 0)
            return UWORD(0);

        slong i = 0;
        _NMOD_VEC_DOT_SHORT(i, vec1[i], vec2[i][offset], len, mod, params.method);
    }

    switch (params.method)
    {
        case _DOT1:
            return _nmod_vec_dot1_ptr(vec1, vec2, offset, len, mod);

#if FLINT_BITS == 64
        case _DOT2_SPLIT:
            return _nmod_vec_dot2_split_ptr(vec1, vec2, offset, len, mod, params.pow2_precomp);
#endif

        case _DOT2:
            return _nmod_vec_dot2_ptr(vec1, vec2, offset, len, mod);

        case _DOT3_ACC:
            return _nmod_vec_dot3_acc_ptr(vec1, vec2, offset, len, mod);

        case _DOT3:
            return _nmod_vec_dot3_ptr(vec1, vec2, offset, len, mod);

        case _DOT2_HALF:
            return _nmod_vec_dot2_half_ptr(vec1, vec2, offset, len, mod);

        case _DOT_POW2:
            /* small power-of-two moduli use the single-limb kernel */
            return (mod.n <= UWORD(1) << (FLINT_BITS / 2))
                 ? _nmod_vec_dot1_ptr(vec1, vec2, offset, len, mod)
                 : _nmod_vec_dot_pow2_ptr(vec1, vec2, offset, len, mod);

        default:
            /* e.g. _DOT0 */
            return UWORD(0);
    }
}
#undef _NMOD_VEC_DOT_SHORT1
#undef _NMOD_VEC_DOT_SHORT2
#undef _NMOD_VEC_DOT_SHORT3
/*
    Single-limb dot product loop. Sets res to 0, runs i from 0 to len - 1
    adding (expr1) * (expr2) into res with plain word arithmetic, then
    reduces res with NMOD_RED. expr1 / expr2 are re-evaluated on every
    iteration (typically they are written in terms of i).
    The expansion already ends in a semicolon; NMOD_VEC_DOT relies on that
    in its if / else-if chain, so do not remove it.
*/
#define _NMOD_VEC_DOT1(res, i, len, expr1, expr2, mod) \
do \
{ \
res = UWORD(0); \
for (i = 0; i < (len); i++) \
res += (expr1) * (expr2); \
NMOD_RED(res, res, mod); \
} while(0);
#if (FLINT_BITS == 64)
/*
    "Split" dot product loop (64-bit only). Products are added into dp_lo in
    groups of eight; after each group the bits of dp_lo above DOT_SPLIT_BITS
    are added to dp_hi and dp_lo is masked with DOT_SPLIT_MASK. Remaining
    terms (fewer than eight) are added to dp_lo afterwards. The result is
    pow2_precomp * dp_hi + dp_lo, reduced with NMOD_RED.
    i is advanced inside the unrolled body, so expr1 / expr2 are expected to
    depend on i.
    NOTE(review): dp_lo / dp_hi do not carry the "zz" suffix the other loop
    macros use for their locals; expr1 / expr2 must not mention those names.
    The expansion already ends in a semicolon (needed by NMOD_VEC_DOT).
*/
#define _NMOD_VEC_DOT2_SPLIT(res, i, len, expr1, expr2, mod, pow2_precomp) \
do \
{ \
ulong dp_lo = 0; \
ulong dp_hi = 0; \
\
for (i = 0; i+7 < (len); ) \
{ \
dp_lo += (expr1) * (expr2); i++; \
dp_lo += (expr1) * (expr2); i++; \
dp_lo += (expr1) * (expr2); i++; \
dp_lo += (expr1) * (expr2); i++; \
dp_lo += (expr1) * (expr2); i++; \
dp_lo += (expr1) * (expr2); i++; \
dp_lo += (expr1) * (expr2); i++; \
dp_lo += (expr1) * (expr2); i++; \
\
dp_hi += dp_lo >> DOT_SPLIT_BITS; \
dp_lo &= DOT_SPLIT_MASK; \
} \
\
for ( ; i < (len); i++) \
dp_lo += (expr1) * (expr2); \
\
res = pow2_precomp * dp_hi + dp_lo; \
NMOD_RED(res, res, mod); \
} while(0);
#endif
/*
    Two-limb accumulation of single-limb products. Each product
    (expr1) * (expr2) is computed with plain word multiplication and added
    into the two-limb accumulator (s1zz, s0zz) via add_ssaaaa; the total is
    reduced with NMOD2_RED2 into res.
    Locals carry a "zz" suffix, presumably to avoid clashing with names used
    in expr1 / expr2.
    The expansion already ends in a semicolon (needed by NMOD_VEC_DOT).
*/
#define _NMOD_VEC_DOT2_HALF(res, i, len, expr1, expr2, mod) \
do \
{ \
ulong s0zz = UWORD(0); \
ulong s1zz = UWORD(0); \
for (i = 0; i < (len); i++) \
{ \
const ulong prodzz = (expr1) * (expr2); \
add_ssaaaa(s1zz, s0zz, s1zz, s0zz, 0, prodzz); \
} \
NMOD2_RED2(res, s1zz, s0zz, mod); \
} while(0);
/*
    Two-limb dot product loop, unrolled by eight. Every product is formed as
    a two-limb value with umul_ppmm and added into (u1zz, u0zz) with
    add_ssaaaa; i is advanced after each term inside the unrolled body. A
    second loop handles the remaining (fewer than eight) terms. The total is
    reduced with NMOD2_RED2 into res.
    The expansion already ends in a semicolon (needed by NMOD_VEC_DOT).
*/
#define _NMOD_VEC_DOT2(res, i, len, expr1, expr2, mod) \
do \
{ \
ulong u0zz = UWORD(0); \
ulong u1zz = UWORD(0); \
\
for (i = 0; i+7 < (len); ) \
{ \
ulong s0zz, s1zz; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
i++; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
i++; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
i++; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
i++; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
i++; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
i++; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
i++; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
i++; \
} \
for ( ; i < (len); i++) \
{ \
ulong s0zz, s1zz; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
} \
\
NMOD2_RED2(res, u1zz, u0zz, mod); \
} while(0);
/*
    Three-limb dot product with a two-limb intermediate accumulator.
    Terms are processed in groups of eight: within a group the two-limb
    products are summed into a fresh (u1zz, u0zz), and only once per group
    is that partial sum added into the three-limb total (t2zz, t1zz, t0zz).
    The remaining (fewer than eight) terms are summed the same way and added
    once at the end. Finally the top limb is reduced with NMOD_RED and the
    whole value with NMOD_RED3 into res.
    NOTE(review): this is only correct if eight products never overflow two
    limbs; that depends on the modulus bound under which the caller selects
    _DOT3_ACC -- confirm against _nmod_vec_dot_params.
    The expansion already ends in a semicolon (needed by NMOD_VEC_DOT).
*/
#define _NMOD_VEC_DOT3_ACC(res, i, len, expr1, expr2, mod) \
do \
{ \
ulong t2zz = UWORD(0); \
ulong t1zz = UWORD(0); \
ulong t0zz = UWORD(0); \
\
for (i = 0; i+7 < (len); ) \
{ \
ulong s0zz, s1zz; \
ulong u0zz = UWORD(0); \
ulong u1zz = UWORD(0); \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
i++; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
i++; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
i++; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
i++; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
i++; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
i++; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
i++; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
i++; \
add_sssaaaaaa(t2zz, t1zz, t0zz, \
t2zz, t1zz, t0zz, \
UWORD(0), u1zz, u0zz); \
} \
\
ulong s0zz, s1zz; \
ulong u0zz = UWORD(0); \
ulong u1zz = UWORD(0); \
for ( ; i < (len); i++) \
{ \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_ssaaaa(u1zz, u0zz, u1zz, u0zz, s1zz, s0zz); \
} \
\
add_sssaaaaaa(t2zz, t1zz, t0zz, \
t2zz, t1zz, t0zz, \
UWORD(0), u1zz, u0zz); \
\
NMOD_RED(t2zz, t2zz, mod); \
NMOD_RED3(res, t2zz, t1zz, t0zz, mod); \
} while(0);
/*
    Plain three-limb dot product loop. Each two-limb product from umul_ppmm
    is added directly into the three-limb total (t2zz, t1zz, t0zz) with
    add_sssaaaaaa. Afterwards the top limb is reduced with NMOD_RED and the
    whole value with NMOD_RED3 into res.
    The expansion already ends in a semicolon (needed by NMOD_VEC_DOT).
*/
#define _NMOD_VEC_DOT3(res, i, len, expr1, expr2, mod) \
do \
{ \
ulong t2zz = UWORD(0); \
ulong t1zz = UWORD(0); \
ulong t0zz = UWORD(0); \
for (i = 0; i < (len); i++) \
{ \
ulong s0zz, s1zz; \
umul_ppmm(s1zz, s0zz, (expr1), (expr2)); \
add_sssaaaaaa(t2zz, t1zz, t0zz, \
t2zz, t1zz, t0zz, \
UWORD(0), s1zz, s0zz); \
} \
\
NMOD_RED(t2zz, t2zz, mod); \
NMOD_RED3(res, t2zz, t1zz, t0zz, mod); \
} while(0);
/*
    Generic dot product macro: selects one of the loop macros above from
    params.method and leaves the result in res.

    res is first set to 0 and stays 0 when params.method matches none of the
    branches (for example _DOT0). _DOT_POW2 is handled by the single-limb
    loop. The 64-bit variant additionally handles _DOT2_SPLIT, passing
    params.pow2_precomp.

    The if / else-if chain has no braces and works only because every
    _NMOD_VEC_DOT* macro expands to a complete statement including its
    trailing semicolon. This macro's own expansion also ends in a semicolon.
*/
#if (FLINT_BITS == 64)
#define NMOD_VEC_DOT(res, i, len, expr1, expr2, mod, params) \
do \
{ \
res = UWORD(0); \
if (params.method == _DOT1 || params.method == _DOT_POW2) \
_NMOD_VEC_DOT1(res, i, len, expr1, expr2, mod) \
else if (params.method == _DOT2_SPLIT) \
_NMOD_VEC_DOT2_SPLIT(res, i, len, expr1, expr2, mod, \
params.pow2_precomp) \
else if (params.method == _DOT2_HALF) \
_NMOD_VEC_DOT2_HALF(res, i, len, expr1, expr2, mod) \
else if (params.method == _DOT2) \
_NMOD_VEC_DOT2(res, i, len, expr1, expr2, mod) \
else if (params.method == _DOT3_ACC) \
_NMOD_VEC_DOT3_ACC(res, i, len, expr1, expr2, mod) \
else if (params.method == _DOT3) \
_NMOD_VEC_DOT3(res, i, len, expr1, expr2, mod) \
} while(0);
#else
#define NMOD_VEC_DOT(res, i, len, expr1, expr2, mod, params) \
do \
{ \
res = UWORD(0); \
if (params.method == _DOT1 || params.method == _DOT_POW2) \
_NMOD_VEC_DOT1(res, i, len, expr1, expr2, mod) \
else if (params.method == _DOT2_HALF) \
_NMOD_VEC_DOT2_HALF(res, i, len, expr1, expr2, mod) \
else if (params.method == _DOT2) \
_NMOD_VEC_DOT2(res, i, len, expr1, expr2, mod) \
else if (params.method == _DOT3_ACC) \
_NMOD_VEC_DOT3_ACC(res, i, len, expr1, expr2, mod) \
else if (params.method == _DOT3) \
_NMOD_VEC_DOT3(res, i, len, expr1, expr2, mod) \
} while(0);
#endif
#ifdef __cplusplus
}
#endif
#endif