#include <stdint.h>
#include <math.h>
#include "misc.h"
/* CONFIG must be defined by the includer; it is used below as log2 of the
 * number of double lanes per vector. */
#ifndef CONFIG
#error CONFIG macro not defined
#endif
/* Feature switches; they are only defined here, their consumers are outside this file. */
#define ENABLE_DP
#define ENABLE_SP
/* A double-precision vector has (1 << CONFIG) lanes. */
#define LOG2VECTLENDP CONFIG
#define VECTLENDP (1 << LOG2VECTLENDP)
/* A single-precision vector has twice as many lanes as a double-precision one. */
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
#define ACCURATE_SQRT
#define DFTPRIORITY LOG2VECTLENDP
#define ISANAME "Pure C Array"
/*
 * One storage type shared by every "vector" flavour in this file.  The same
 * bytes can be viewed as VECTLENDP 64-bit lanes (x, d) or as 2*VECTLENDP
 * 32-bit lanes (u, f, i); the helpers below pick the view they need.
 */
typedef union {
uint32_t u[VECTLENDP*2];
uint64_t x[VECTLENDP];
double d[VECTLENDP];
float f[VECTLENDP*2];
int32_t i[VECTLENDP*2];
} versatileVector;
/* All vector and mask type names are aliases of the same union, so values can
 * be passed between them without conversion. */
typedef versatileVector vmask;
typedef versatileVector vopmask;
typedef versatileVector vdouble;
typedef versatileVector vint;
typedef versatileVector vfloat;
typedef versatileVector vint2;
/* Vector of VECTLENDP long double lanes, with a byte view of the same storage. */
typedef union {
uint8_t u[sizeof(long double)*VECTLENDP];
long double ld[VECTLENDP];
} longdoubleVector;
typedef longdoubleVector vmaskl;
typedef longdoubleVector vlongdouble;
/* Quad-precision vector type; only declared when Sleef_quad2_DEFINED is defined. */
#ifdef Sleef_quad2_DEFINED
/* Vector of VECTLENDP Sleef_quad lanes, with a byte view of the same storage. */
typedef union {
uint8_t u[sizeof(Sleef_quad)*VECTLENDP];
Sleef_quad q[VECTLENDP];
} quadVector;
typedef quadVector vmaskq;
typedef quadVector vquad;
#endif
/* Availability query: `name` is ignored and -1 is always returned. */
static INLINE int vavailability_i(int name) {
  (void)name;
  return -1;
}

/* Prefetch hook: does nothing in this implementation. */
static INLINE void vprefetch_v_p(const void *ptr) {
  (void)ptr;
}
/* Returns 1 when every 64-bit lane of g is nonzero, otherwise 0. */
static INLINE int vtestallones_i_vo64(vopmask g) {
  for (int lane = 0; lane < VECTLENDP; lane++) {
    if (g.x[lane] == 0) {
      return 0;
    }
  }
  return 1;
}

/* Returns 1 when every 32-bit lane of g is nonzero, otherwise 0. */
static INLINE int vtestallones_i_vo32(vopmask g) {
  for (int lane = 0; lane < VECTLENSP; lane++) {
    if (g.u[lane] == 0) {
      return 0;
    }
  }
  return 1;
}
/* Reads VECTLENSP consecutive int32_t values starting at p into a vint2. */
static vint2 vloadu_vi2_p(int32_t *p) {
  vint2 loaded;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    loaded.i[lane] = p[lane];
  }
  return loaded;
}

/* Writes the VECTLENSP int32_t lanes of v to consecutive elements starting at p. */
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
  for (int lane = 0; lane < VECTLENSP; lane++) {
    p[lane] = v.i[lane];
  }
}
/* Reads VECTLENDP consecutive int32_t values starting at p into the first
 * VECTLENDP lanes of a vint; the remaining lanes are left unset. */
static vint vloadu_vi_p(int32_t *p) {
  vint loaded;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    loaded.i[lane] = p[lane];
  }
  return loaded;
}

/* Writes the first VECTLENDP int32_t lanes of v to consecutive elements starting at p. */
static void vstoreu_v_p_vi(int32_t *p, vint v) {
  for (int lane = 0; lane < VECTLENDP; lane++) {
    p[lane] = v.i[lane];
  }
}
/*
 * Narrows a 64-bit-lane mask to 32-bit lanes: output word k (k < VECTLENDP)
 * is the odd-indexed 32-bit word of input lane k; the upper VECTLENDP words
 * of the result are zero.
 */
static INLINE vopmask vcast_vo32_vo64(vopmask m) {
  vopmask narrowed;
  for (int k = 0; k < VECTLENDP * 2; k++) {
    narrowed.u[k] = (k < VECTLENDP) ? m.u[k * 2 + 1] : 0;
  }
  return narrowed;
}

/*
 * Widens a 32-bit-lane mask to 64-bit lanes: input word k (k < VECTLENDP)
 * is copied into both 32-bit words of output lane k.
 */
static INLINE vopmask vcast_vo64_vo32(vopmask m) {
  vopmask widened;
  for (int k = 0; k < VECTLENDP; k++) {
    widened.u[k * 2 + 1] = m.u[k];
    widened.u[k * 2] = widened.u[k * 2 + 1];
  }
  return widened;
}
/* Fills every 64-bit lane with the pair (l, h): l goes to 32-bit word 2k and
 * h to word 2k+1 of lane k. */
static INLINE vmask vcast_vm_i_i(int h, int l) {
  vmask filled;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    const int even = lane * 2;
    filled.u[even] = l;
    filled.u[even + 1] = h;
  }
  return filled;
}
/* Spreads the first VECTLENDP int lanes of vi into the odd 32-bit words of the
 * result; the even words are set to zero. */
static INLINE vint2 vcastu_vi2_vi(vint vi) {
  vint2 spread;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    const int even = lane * 2;
    spread.i[even] = 0;
    spread.i[even + 1] = vi.i[lane];
  }
  return spread;
}

/* Gathers the odd 32-bit words of vi2 into the first VECTLENDP lanes of the result. */
static INLINE vint vcastu_vi_vi2(vint2 vi2) {
  vint gathered;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    gathered.i[lane] = vi2.i[lane * 2 + 1];
  }
  return gathered;
}
/* Copies the first VECTLENDP 32-bit lanes of vi2 into a vint. */
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) {
  vint first_half;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    first_half.i[lane] = vi2.i[lane];
  }
  return first_half;
}

/* Copies the first VECTLENDP 32-bit lanes of vi into a vint2 and zeroes the
 * remaining VECTLENDP lanes. */
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) {
  vint2 extended;
  for (int lane = 0; lane < VECTLENDP * 2; lane++) {
    extended.i[lane] = (lane < VECTLENDP) ? vi.i[lane] : 0;
  }
  return extended;
}
/* Swaps the two doubles inside each adjacent pair (lanes 2k and 2k+1). */
static INLINE vdouble vrev21_vd_vd(vdouble d0) {
  const int npairs = VECTLENDP / 2;
  vdouble swapped;
  for (int pair = 0; pair < npairs; pair++) {
    const int lo = pair * 2;
    const int hi = lo + 1;
    swapped.d[lo] = d0.d[hi];
    swapped.d[hi] = d0.d[lo];
  }
  return swapped;
}

/* Reverses the order of the adjacent pairs while keeping the order inside each pair. */
static INLINE vdouble vreva2_vd_vd(vdouble d0) {
  const int npairs = VECTLENDP / 2;
  vdouble reversed;
  for (int pair = 0; pair < npairs; pair++) {
    const int dst = pair * 2;
    const int src = (npairs - 1 - pair) * 2;
    reversed.d[dst] = d0.d[src];
    reversed.d[dst + 1] = d0.d[src + 1];
  }
  return reversed;
}
/* Swaps the two floats inside each adjacent pair (lanes 2k and 2k+1). */
static INLINE vfloat vrev21_vf_vf(vfloat d0) {
  const int npairs = VECTLENSP / 2;
  vfloat swapped;
  for (int pair = 0; pair < npairs; pair++) {
    const int lo = pair * 2;
    const int hi = lo + 1;
    swapped.f[lo] = d0.f[hi];
    swapped.f[hi] = d0.f[lo];
  }
  return swapped;
}

/* Reverses the order of the adjacent pairs while keeping the order inside each pair. */
static INLINE vfloat vreva2_vf_vf(vfloat d0) {
  const int npairs = VECTLENSP / 2;
  vfloat reversed;
  for (int pair = 0; pair < npairs; pair++) {
    const int dst = pair * 2;
    const int src = (npairs - 1 - pair) * 2;
    reversed.f[dst] = d0.f[src];
    reversed.f[dst + 1] = d0.f[src + 1];
  }
  return reversed;
}
/* Broadcasts d into every double lane. */
static INLINE vdouble vcast_vd_d(double d) {
  vdouble splat;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    splat.d[lane] = d;
  }
  return splat;
}
/* Bitwise AND of two operation masks, over all 32-bit words. */
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) {
  vopmask out;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    out.u[w] = x.u[w] & y.u[w];
  }
  return out;
}

/* Bitwise AND-NOT: y with the bits of x cleared (y & ~x). */
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) {
  vopmask out;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    out.u[w] = y.u[w] & ~x.u[w];
  }
  return out;
}

/* Bitwise OR of two operation masks. */
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) {
  vopmask out;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    out.u[w] = x.u[w] | y.u[w];
  }
  return out;
}

/* Bitwise XOR of two operation masks. */
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) {
  vopmask out;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    out.u[w] = x.u[w] ^ y.u[w];
  }
  return out;
}
/* Bitwise AND of two masks, over all 32-bit words. */
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) {
  vmask out;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    out.u[w] = x.u[w] & y.u[w];
  }
  return out;
}

/* Bitwise AND-NOT: y with the bits of x cleared (y & ~x). */
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) {
  vmask out;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    out.u[w] = y.u[w] & ~x.u[w];
  }
  return out;
}

/* Bitwise OR of two masks. */
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) {
  vmask out;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    out.u[w] = x.u[w] | y.u[w];
  }
  return out;
}

/* Bitwise XOR of two masks. */
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) {
  vmask out;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    out.u[w] = x.u[w] ^ y.u[w];
  }
  return out;
}
/* Bitwise AND of an operation mask with a mask, word by word. */
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) {
  vmask out;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    out.u[w] = x.u[w] & y.u[w];
  }
  return out;
}

/* Clears in y the bits that are set in the operation mask x (y & ~x). */
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) {
  vmask out;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    out.u[w] = y.u[w] & ~x.u[w];
  }
  return out;
}

/* Bitwise OR of an operation mask with a mask. */
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) {
  vmask out;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    out.u[w] = x.u[w] | y.u[w];
  }
  return out;
}

/* Bitwise XOR of an operation mask with a mask. */
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) {
  vmask out;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    out.u[w] = x.u[w] ^ y.u[w];
  }
  return out;
}
/* Bitwise AND of an operation mask with a mask, word by word. */
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) {
  vmask out;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    out.u[w] = x.u[w] & y.u[w];
  }
  return out;
}

/* Clears in y the bits that are set in the operation mask x (y & ~x). */
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) {
  vmask out;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    out.u[w] = y.u[w] & ~x.u[w];
  }
  return out;
}

/* Bitwise OR of an operation mask with a mask. */
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) {
  vmask out;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    out.u[w] = x.u[w] | y.u[w];
  }
  return out;
}

/* Bitwise XOR of an operation mask with a mask. */
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) {
  vmask out;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    out.u[w] = x.u[w] ^ y.u[w];
  }
  return out;
}
/* Bitwise select: takes the bits of x where o is set and the bits of y where
 * o is clear, over all 32-bit words. */
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) {
  vdouble picked;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    const uint32_t from_x = o.u[w] & x.u[w];
    const uint32_t from_y = y.u[w] & ~o.u[w];
    picked.u[w] = from_x | from_y;
  }
  return picked;
}

/* Same bitwise select, for vint2 operands. */
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) {
  vint2 picked;
  for (int w = 0; w < VECTLENDP * 2; w++) {
    const uint32_t from_x = o.u[w] & x.u[w];
    const uint32_t from_y = y.u[w] & ~o.u[w];
    picked.u[w] = from_x | from_y;
  }
  return picked;
}
/* Selects between two scalar constants: v1 where o is set, v0 elsewhere. */
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
  const vdouble when_set = vcast_vd_d(v1);
  const vdouble when_clear = vcast_vd_d(v0);
  return vsel_vd_vo_vd_vd(o, when_set, when_clear);
}

/* Two-level select: d0 where o0 is set, else d1 where o1 is set, else d2. */
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  const vdouble fallback = vsel_vd_vo_d_d(o1, d1, d2);
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), fallback);
}

/* Three-level select: d0 where o0 is set, else d1 where o1 is set, else d2
 * where o2 is set, else d3. */
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  const vdouble innermost = vsel_vd_vo_d_d(o2, d2, d3);
  const vdouble middle = vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), innermost);
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), middle);
}
/* Converts the first VECTLENDP int lanes to double. */
static INLINE vdouble vcast_vd_vi(vint vi) {
  vdouble converted;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    converted.d[lane] = vi.i[lane];
  }
  return converted;
}

/* Converts each double lane to int with a C cast (truncation toward zero). */
static INLINE vint vtruncate_vi_vd(vdouble vd) {
  vint truncated;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    truncated.i[lane] = (int)vd.d[lane];
  }
  return truncated;
}

/* Rounds each double lane to int by adding 0.5 (positive values) or
 * subtracting 0.5 (everything else) before the truncating cast. */
static INLINE vint vrint_vi_vd(vdouble vd) {
  vint rounded;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    if (vd.d[lane] > 0) {
      rounded.i[lane] = (int)(vd.d[lane] + 0.5);
    } else {
      rounded.i[lane] = (int)(vd.d[lane] - 0.5);
    }
  }
  return rounded;
}

/* Truncation to an integral double, done via an int round trip. */
static INLINE vdouble vtruncate_vd_vd(vdouble vd) {
  const vint as_int = vtruncate_vi_vd(vd);
  return vcast_vd_vi(as_int);
}

/* Rounding to an integral double, done via an int round trip. */
static INLINE vdouble vrint_vd_vd(vdouble vd) {
  const vint as_int = vrint_vi_vd(vd);
  return vcast_vd_vi(as_int);
}

/* Broadcasts j into the first VECTLENDP int lanes. */
static INLINE vint vcast_vi_i(int j) {
  vint splat;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    splat.i[lane] = j;
  }
  return splat;
}
/* 64-bit lane equality: all-ones where the lanes match, zero elsewhere. */
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
  vopmask same;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    same.x[lane] = (x.x[lane] == y.x[lane]) ? UINT64_MAX : 0;
  }
  return same;
}

/* 64-bit lane-wise unsigned addition. */
static INLINE vmask vadd64_vo_vm_vm(vmask x, vmask y) {
  vmask sum;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    sum.x[lane] = x.x[lane] + y.x[lane];
  }
  return sum;
}
/* Bit-for-bit reinterpretation of a vdouble as a vmask. */
static INLINE vmask vreinterpret_vm_vd(vdouble vd) {
  union { vdouble src; vmask dst; } pun;
  pun.src = vd;
  return pun.dst;
}

/* Bit-for-bit reinterpretation of a vdouble as a vint2. */
static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) {
  union { vdouble src; vint2 dst; } pun;
  pun.src = vd;
  return pun.dst;
}

/* Bit-for-bit reinterpretation of a vint2 as a vdouble. */
static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) {
  union { vint2 src; vdouble dst; } pun;
  pun.src = vi;
  return pun.dst;
}

/* Bit-for-bit reinterpretation of a vmask as a vdouble. */
static INLINE vdouble vreinterpret_vd_vm(vmask vm) {
  union { vmask src; vdouble dst; } pun;
  pun.src = vm;
  return pun.dst;
}
/* Lane-wise x + y. */
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) {
  vdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.d[lane] = x.d[lane] + y.d[lane];
  }
  return out;
}

/* Lane-wise x - y. */
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) {
  vdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.d[lane] = x.d[lane] - y.d[lane];
  }
  return out;
}

/* Lane-wise x * y. */
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) {
  vdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.d[lane] = x.d[lane] * y.d[lane];
  }
  return out;
}

/* Lane-wise x / y. */
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) {
  vdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.d[lane] = x.d[lane] / y.d[lane];
  }
  return out;
}

/* Lane-wise reciprocal 1.0 / x. */
static INLINE vdouble vrec_vd_vd(vdouble x) {
  vdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.d[lane] = 1.0 / x.d[lane];
  }
  return out;
}

/* Lane-wise absolute value, obtained by clearing bit 63 of each 64-bit lane. */
static INLINE vdouble vabs_vd_vd(vdouble d) {
  const uint64_t keep_magnitude = ~(UINT64_C(1) << 63);
  vdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.x[lane] = d.x[lane] & keep_magnitude;
  }
  return out;
}

/* Lane-wise negation. */
static INLINE vdouble vneg_vd_vd(vdouble d) {
  vdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.d[lane] = -d.d[lane];
  }
  return out;
}

/* Lane-wise multiply-add: x * y + z. */
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  vdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.d[lane] = x.d[lane] * y.d[lane] + z.d[lane];
  }
  return out;
}

/* Lane-wise multiply-subtract: x * y - z. */
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  vdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.d[lane] = x.d[lane] * y.d[lane] - z.d[lane];
  }
  return out;
}
/* Lane-wise maximum: x when x > y, otherwise y. */
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) {
  vdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    if (x.d[lane] > y.d[lane]) {
      out.d[lane] = x.d[lane];
    } else {
      out.d[lane] = y.d[lane];
    }
  }
  return out;
}

/* Lane-wise minimum: x when x < y, otherwise y. */
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) {
  vdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    if (x.d[lane] < y.d[lane]) {
      out.d[lane] = x.d[lane];
    } else {
      out.d[lane] = y.d[lane];
    }
  }
  return out;
}

/* Keeps even lanes and negates odd lanes. */
static INLINE vdouble vposneg_vd_vd(vdouble d) {
  vdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    if ((lane & 1) == 0) {
      out.d[lane] = d.d[lane];
    } else {
      out.d[lane] = -d.d[lane];
    }
  }
  return out;
}

/* Negates even lanes and keeps odd lanes. */
static INLINE vdouble vnegpos_vd_vd(vdouble d) {
  vdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    if ((lane & 1) == 0) {
      out.d[lane] = -d.d[lane];
    } else {
      out.d[lane] = d.d[lane];
    }
  }
  return out;
}

/* Even lanes get x - y, odd lanes get x + y. */
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) {
  vdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    if ((lane & 1) == 0) {
      out.d[lane] = x.d[lane] - y.d[lane];
    } else {
      out.d[lane] = x.d[lane] + y.d[lane];
    }
  }
  return out;
}

/* Computes x * y first, then applies the sub/add pattern with z. */
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  const vdouble product = vmul_vd_vd_vd(x, y);
  return vsubadd_vd_vd_vd(product, z);
}
/* The double comparisons below set a 64-bit lane to all-ones when the
 * comparison holds and to zero otherwise. */

/* x == y */
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    hit.x[lane] = (x.d[lane] == y.d[lane]) ? UINT64_MAX : 0;
  }
  return hit;
}

/* x != y */
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    hit.x[lane] = (x.d[lane] != y.d[lane]) ? UINT64_MAX : 0;
  }
  return hit;
}

/* x < y */
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    hit.x[lane] = (x.d[lane] < y.d[lane]) ? UINT64_MAX : 0;
  }
  return hit;
}

/* x <= y */
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    hit.x[lane] = (x.d[lane] <= y.d[lane]) ? UINT64_MAX : 0;
  }
  return hit;
}

/* x > y */
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    hit.x[lane] = (x.d[lane] > y.d[lane]) ? UINT64_MAX : 0;
  }
  return hit;
}

/* x >= y */
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    hit.x[lane] = (x.d[lane] >= y.d[lane]) ? UINT64_MAX : 0;
  }
  return hit;
}
/* The vint helpers below operate on the first VECTLENDP 32-bit lanes only. */

/* Lane-wise x + y. */
static INLINE vint vadd_vi_vi_vi(vint x, vint y) {
  vint out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.i[lane] = x.i[lane] + y.i[lane];
  }
  return out;
}

/* Lane-wise x - y. */
static INLINE vint vsub_vi_vi_vi(vint x, vint y) {
  vint out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.i[lane] = x.i[lane] - y.i[lane];
  }
  return out;
}

/* Lane-wise negation. */
static INLINE vint vneg_vi_vi(vint x) {
  vint out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.i[lane] = -x.i[lane];
  }
  return out;
}

/* Lane-wise bitwise AND. */
static INLINE vint vand_vi_vi_vi(vint x, vint y) {
  vint out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.i[lane] = x.i[lane] & y.i[lane];
  }
  return out;
}

/* Lane-wise AND-NOT: y & ~x. */
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) {
  vint out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.i[lane] = y.i[lane] & ~x.i[lane];
  }
  return out;
}

/* Lane-wise bitwise OR. */
static INLINE vint vor_vi_vi_vi(vint x, vint y) {
  vint out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.i[lane] = x.i[lane] | y.i[lane];
  }
  return out;
}

/* Lane-wise bitwise XOR. */
static INLINE vint vxor_vi_vi_vi(vint x, vint y) {
  vint out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.i[lane] = x.i[lane] ^ y.i[lane];
  }
  return out;
}
/* ANDs y with the first VECTLENDP 32-bit lanes of the operation mask x. */
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) {
  const vint mask_lanes = vreinterpretFirstHalf_vi_vi2(x);
  return vand_vi_vi_vi(mask_lanes, y);
}

/* Clears in y the bits set in the first VECTLENDP 32-bit lanes of x. */
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) {
  const vint mask_lanes = vreinterpretFirstHalf_vi_vi2(x);
  return vandnot_vi_vi_vi(mask_lanes, y);
}
/*
 * Logical left shift of each of the first VECTLENDP 32-bit lanes by c bits.
 *
 * The shift is performed on the unsigned view of the lane (the .u member of
 * the union).  Left-shifting a negative int32_t, or shifting a set bit into
 * or past the sign bit, is undefined behaviour in C; the unsigned shift just
 * discards the bits that fall off the top, which is the result a SIMD shift
 * produces.  c must be in the range [0, 31].
 */
static INLINE vint vsll_vi_vi_i(vint x, int c) {
  vint ret;
  for(int i=0;i<VECTLENDP;i++) {
    ret.u[i] = x.u[i] << c;
  }
  return ret;
}
/* Logical right shift: each of the first VECTLENDP lanes is shifted as an
 * unsigned 32-bit value. */
static INLINE vint vsrl_vi_vi_i(vint x, int c) {
  vint shifted;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    const uint32_t bits = (uint32_t)x.i[lane];
    shifted.i[lane] = bits >> c;
  }
  return shifted;
}

/* Right shift applied directly to the signed lane value. */
static INLINE vint vsra_vi_vi_i(vint x, int c) {
  vint shifted;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    shifted.i[lane] = x.i[lane] >> c;
  }
  return shifted;
}
/* Integer equality on the first VECTLENDP lanes: all-ones 32-bit word where
 * x == y, zero elsewhere.  Only those first VECTLENDP words are written. */
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    hit.u[lane] = (x.i[lane] == y.i[lane]) ? UINT32_MAX : 0;
  }
  return hit;
}

/* Signed integer greater-than on the first VECTLENDP lanes: all-ones 32-bit
 * word where x > y, zero elsewhere.  Only those first VECTLENDP words are written. */
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    hit.u[lane] = (x.i[lane] > y.i[lane]) ? UINT32_MAX : 0;
  }
  return hit;
}
/* Bitwise select on the first VECTLENDP int lanes: bits of x where m is set,
 * bits of y where m is clear. */
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
  union { vopmask as_vo; vint2 as_vi2; } pun;
  pun.as_vo = m;
  const vint from_x = vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(pun.as_vi2), x);
  const vint from_y = vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(pun.as_vi2), y);
  return vor_vi_vi_vi(from_x, from_y);
}
/* All-ones 64-bit lane where d equals +SLEEF_INFINITY or -SLEEF_INFINITY. */
static INLINE vopmask visinf_vo_vd(vdouble d) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    if (d.d[lane] == SLEEF_INFINITY || d.d[lane] == -SLEEF_INFINITY) {
      hit.x[lane] = UINT64_MAX;
    } else {
      hit.x[lane] = 0;
    }
  }
  return hit;
}

/* All-ones 64-bit lane where d equals +SLEEF_INFINITY. */
static INLINE vopmask vispinf_vo_vd(vdouble d) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    hit.x[lane] = (d.d[lane] == SLEEF_INFINITY) ? UINT64_MAX : 0;
  }
  return hit;
}

/* All-ones 64-bit lane where d equals -SLEEF_INFINITY. */
static INLINE vopmask visminf_vo_vd(vdouble d) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    hit.x[lane] = (d.d[lane] == -SLEEF_INFINITY) ? UINT64_MAX : 0;
  }
  return hit;
}

/* All-ones 64-bit lane where d compares unequal to itself (the NaN test). */
static INLINE vopmask visnan_vo_vd(vdouble d) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    hit.x[lane] = (d.d[lane] != d.d[lane]) ? UINT64_MAX : 0;
  }
  return hit;
}
/* Lane-wise square root using sqrt() from <math.h>. */
static INLINE vdouble vsqrt_vd_vd(vdouble d) {
  vdouble roots;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    roots.d[lane] = sqrt(d.d[lane]);
  }
  return roots;
}
#if defined(_MSC_VER)
/* Extracts lane 0 of a vdouble; only compiled when _MSC_VER is defined. */
static INLINE double vcast_d_vd(vdouble v) {
  const double first_lane = v.d[0];
  return first_lane;
}
#endif
/* Loads a whole vdouble by reading *ptr through a vdouble pointer.
 * NOTE(review): presumably ptr must be suitably aligned for vdouble — confirm. */
static INLINE vdouble vload_vd_p(const double *ptr) {
  vdouble *src = (vdouble *)ptr;
  return *src;
}

/* Loads VECTLENDP doubles element by element. */
static INLINE vdouble vloadu_vd_p(const double *ptr) {
  vdouble loaded;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    loaded.d[lane] = ptr[lane];
  }
  return loaded;
}

/* Gathers ptr[vi.i[k]] into lane k for each of the VECTLENDP lanes. */
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  vdouble gathered;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    const int index = vi.i[lane];
    gathered.d[lane] = ptr[index];
  }
  return gathered;
}

/* Stores a whole vdouble by writing *ptr through a vdouble pointer. */
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) {
  vdouble *dst = (vdouble *)ptr;
  *dst = v;
}

/* Stores VECTLENDP doubles element by element. */
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) {
  for (int lane = 0; lane < VECTLENDP; lane++) {
    ptr[lane] = v.d[lane];
  }
}

/* "Streaming" store; here it is an ordinary whole-vector store. */
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) {
  vdouble *dst = (vdouble *)ptr;
  *dst = v;
}
/* Writes pair k of v (lanes 2k, 2k+1) to ptr[(offset + step*k)*2] and the
 * element after it, for each of the VECTLENDP/2 pairs. */
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
  const int npairs = VECTLENDP / 2;
  for (int pair = 0; pair < npairs; pair++) {
    double *dst = ptr + (offset + step * pair) * 2;
    dst[0] = v.d[pair * 2 + 0];
    dst[1] = v.d[pair * 2 + 1];
  }
}

/* Same operation as vscatter2_v_p_i_i_vd; simply forwards to it. */
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
  vscatter2_v_p_i_i_vd(ptr, offset, step, v);
}
/* Bit-for-bit reinterpretation of a vmask as a vint2. */
static INLINE vint2 vcast_vi2_vm(vmask vm) {
  union { vint2 as_vi2; vmask as_vm; } pun;
  pun.as_vm = vm;
  return pun.as_vi2;
}

/* Bit-for-bit reinterpretation of a vint2 as a vmask. */
static INLINE vmask vcast_vm_vi2(vint2 vi) {
  union { vint2 as_vi2; vmask as_vm; } pun;
  pun.as_vi2 = vi;
  return pun.as_vm;
}
/* Converts each of the VECTLENSP int lanes to float. */
static INLINE vfloat vcast_vf_vi2(vint2 vi) {
  vfloat converted;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    converted.f[lane] = vi.i[lane];
  }
  return converted;
}

/* Converts each float lane to int with a C cast (truncation toward zero). */
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) {
  vint2 truncated;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    truncated.i[lane] = (int)vf.f[lane];
  }
  return truncated;
}

/* Rounds each float lane to int by adding 0.5 (positive values) or
 * subtracting 0.5 (everything else) before the truncating cast. */
static INLINE vint2 vrint_vi2_vf(vfloat vf) {
  vint2 rounded;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    if (vf.f[lane] > 0) {
      rounded.i[lane] = (int)(vf.f[lane] + 0.5);
    } else {
      rounded.i[lane] = (int)(vf.f[lane] - 0.5);
    }
  }
  return rounded;
}

/* Broadcasts j into all VECTLENSP int lanes. */
static INLINE vint2 vcast_vi2_i(int j) {
  vint2 splat;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    splat.i[lane] = j;
  }
  return splat;
}

/* Truncation to an integral float, done via an int round trip. */
static INLINE vfloat vtruncate_vf_vf(vfloat vd) {
  const vint2 as_int = vtruncate_vi2_vf(vd);
  return vcast_vf_vi2(as_int);
}

/* Rounding to an integral float, done via an int round trip. */
static INLINE vfloat vrint_vf_vf(vfloat vd) {
  const vint2 as_int = vrint_vi2_vf(vd);
  return vcast_vf_vi2(as_int);
}

/* Broadcasts f into all VECTLENSP float lanes. */
static INLINE vfloat vcast_vf_f(float f) {
  vfloat splat;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    splat.f[lane] = f;
  }
  return splat;
}
/* Bit-for-bit reinterpretation of a vfloat as a vmask. */
static INLINE vmask vreinterpret_vm_vf(vfloat vf) {
  union { vfloat as_vf; vmask as_vm; } pun;
  pun.as_vf = vf;
  return pun.as_vm;
}

/* Bit-for-bit reinterpretation of a vmask as a vfloat. */
static INLINE vfloat vreinterpret_vf_vm(vmask vm) {
  union { vfloat as_vf; vmask as_vm; } pun;
  pun.as_vm = vm;
  return pun.as_vf;
}

/* Bit-for-bit reinterpretation of a vint2 as a vfloat. */
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) {
  union { vfloat as_vf; vint2 as_vi2; } pun;
  pun.as_vi2 = vi;
  return pun.as_vf;
}

/* Bit-for-bit reinterpretation of a vfloat as a vint2. */
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) {
  union { vfloat as_vf; vint2 as_vi2; } pun;
  pun.as_vf = vf;
  return pun.as_vi2;
}
/* Lane-wise x + y. */
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) {
  vfloat out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    out.f[lane] = x.f[lane] + y.f[lane];
  }
  return out;
}

/* Lane-wise x - y. */
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) {
  vfloat out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    out.f[lane] = x.f[lane] - y.f[lane];
  }
  return out;
}

/* Lane-wise x * y. */
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) {
  vfloat out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    out.f[lane] = x.f[lane] * y.f[lane];
  }
  return out;
}

/* Lane-wise x / y. */
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) {
  vfloat out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    out.f[lane] = x.f[lane] / y.f[lane];
  }
  return out;
}

/* Lane-wise reciprocal; the division uses the double constant 1.0 and the
 * quotient is then stored as float. */
static INLINE vfloat vrec_vf_vf(vfloat x) {
  vfloat out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    out.f[lane] = 1.0 / x.f[lane];
  }
  return out;
}

/* Lane-wise absolute value, obtained by clearing bit 31 of each 32-bit lane. */
static INLINE vfloat vabs_vf_vf(vfloat x) {
  vfloat out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    out.i[lane] = x.i[lane] & 0x7fffffff;
  }
  return out;
}

/* Lane-wise negation. */
static INLINE vfloat vneg_vf_vf(vfloat x) {
  vfloat out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    out.f[lane] = -x.f[lane];
  }
  return out;
}

/* Lane-wise multiply-add: x * y + z. */
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
  vfloat out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    out.f[lane] = x.f[lane] * y.f[lane] + z.f[lane];
  }
  return out;
}
/*
 * Lane-wise negated multiply-add: z - x * y, i.e. -(x * y) + z.
 *
 * The "np" suffix means negative product, positive addend (the operation an
 * FNMADD instruction performs).  The earlier body computed x * y - z, which
 * is the "pn" operation that this file's vmlapn helper implements.
 * NOTE(review): callers are outside this file; confirm none relied on the
 * old sign.
 */
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
  vfloat ret;
  for(int i=0;i<VECTLENSP;i++) {
    ret.f[i] = z.f[i] - x.f[i] * y.f[i];
  }
  return ret;
}
/* Lane-wise maximum: x when x > y, otherwise y. */
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) {
  vfloat out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    if (x.f[lane] > y.f[lane]) {
      out.f[lane] = x.f[lane];
    } else {
      out.f[lane] = y.f[lane];
    }
  }
  return out;
}

/* Lane-wise minimum: x when x < y, otherwise y. */
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) {
  vfloat out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    if (x.f[lane] < y.f[lane]) {
      out.f[lane] = x.f[lane];
    } else {
      out.f[lane] = y.f[lane];
    }
  }
  return out;
}

/* Keeps even lanes and negates odd lanes. */
static INLINE vfloat vposneg_vf_vf(vfloat x) {
  vfloat out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    if ((lane & 1) == 0) {
      out.f[lane] = x.f[lane];
    } else {
      out.f[lane] = -x.f[lane];
    }
  }
  return out;
}

/* Negates even lanes and keeps odd lanes. */
static INLINE vfloat vnegpos_vf_vf(vfloat x) {
  vfloat out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    if ((lane & 1) == 0) {
      out.f[lane] = -x.f[lane];
    } else {
      out.f[lane] = x.f[lane];
    }
  }
  return out;
}

/* Even lanes get x - y, odd lanes get x + y. */
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) {
  vfloat out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    if ((lane & 1) == 0) {
      out.f[lane] = x.f[lane] - y.f[lane];
    } else {
      out.f[lane] = x.f[lane] + y.f[lane];
    }
  }
  return out;
}

/* Computes x * y first, then applies the sub/add pattern with z. */
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
  const vfloat product = vmul_vf_vf_vf(x, y);
  return vsubadd_vf_vf_vf(product, z);
}
/* The float comparisons below set a 32-bit lane to all-ones when the
 * comparison holds and to zero otherwise. */

/* x == y */
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    hit.u[lane] = (x.f[lane] == y.f[lane]) ? UINT32_MAX : 0;
  }
  return hit;
}

/* x != y */
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    hit.u[lane] = (x.f[lane] != y.f[lane]) ? UINT32_MAX : 0;
  }
  return hit;
}

/* x < y */
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    hit.u[lane] = (x.f[lane] < y.f[lane]) ? UINT32_MAX : 0;
  }
  return hit;
}

/* x <= y */
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    hit.u[lane] = (x.f[lane] <= y.f[lane]) ? UINT32_MAX : 0;
  }
  return hit;
}

/* x > y */
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    hit.u[lane] = (x.f[lane] > y.f[lane]) ? UINT32_MAX : 0;
  }
  return hit;
}

/* x >= y */
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    hit.u[lane] = (x.f[lane] >= y.f[lane]) ? UINT32_MAX : 0;
  }
  return hit;
}
/* The helpers below process all VECTLENSP 32-bit lanes; their signatures are
 * spelled with vint. */

/* Lane-wise x + y. */
static INLINE vint vadd_vi2_vi2_vi2(vint x, vint y) {
  vint out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    out.i[lane] = x.i[lane] + y.i[lane];
  }
  return out;
}

/* Lane-wise x - y. */
static INLINE vint vsub_vi2_vi2_vi2(vint x, vint y) {
  vint out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    out.i[lane] = x.i[lane] - y.i[lane];
  }
  return out;
}

/* Lane-wise negation. */
static INLINE vint vneg_vi2_vi2(vint x) {
  vint out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    out.i[lane] = -x.i[lane];
  }
  return out;
}

/* Lane-wise bitwise AND. */
static INLINE vint vand_vi2_vi2_vi2(vint x, vint y) {
  vint out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    out.i[lane] = x.i[lane] & y.i[lane];
  }
  return out;
}

/* Lane-wise AND-NOT: y & ~x. */
static INLINE vint vandnot_vi2_vi2_vi2(vint x, vint y) {
  vint out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    out.i[lane] = y.i[lane] & ~x.i[lane];
  }
  return out;
}

/* Lane-wise bitwise OR. */
static INLINE vint vor_vi2_vi2_vi2(vint x, vint y) {
  vint out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    out.i[lane] = x.i[lane] | y.i[lane];
  }
  return out;
}

/* Lane-wise bitwise XOR. */
static INLINE vint vxor_vi2_vi2_vi2(vint x, vint y) {
  vint out;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    out.i[lane] = x.i[lane] ^ y.i[lane];
  }
  return out;
}
/* Bitwise select: bits of x where o is set, bits of y where o is clear. */
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) {
  vfloat picked;
  for (int w = 0; w < VECTLENSP; w++) {
    const uint32_t from_x = o.u[w] & x.u[w];
    const uint32_t from_y = y.u[w] & ~o.u[w];
    picked.u[w] = from_x | from_y;
  }
  return picked;
}

/* Selects between two scalar constants: v1 where o is set, v0 elsewhere. */
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
  const vfloat when_set = vcast_vf_f(v1);
  const vfloat when_clear = vcast_vf_f(v0);
  return vsel_vf_vo_vf_vf(o, when_set, when_clear);
}

/* Two-level select: d0 where o0 is set, else d1 where o1 is set, else d2. */
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
  const vfloat fallback = vsel_vf_vo_f_f(o1, d1, d2);
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), fallback);
}

/* Three-level select: d0 where o0 is set, else d1 where o1 is set, else d2
 * where o2 is set, else d3. */
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
  const vfloat innermost = vsel_vf_vo_f_f(o2, d2, d3);
  const vfloat middle = vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), innermost);
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), middle);
}
/* ANDs y with the operation mask x viewed as a vint2. */
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) {
  union { vopmask as_vo; vint2 as_vi2; } pun;
  pun.as_vo = x;
  return vand_vi2_vi2_vi2(pun.as_vi2, y);
}

/* Clears in y the bits that are set in the operation mask x; x is passed
 * straight through to the vint2 AND-NOT helper. */
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) {
  return vandnot_vi2_vi2_vi2(x, y);
}
/*
 * Logical left shift of each of the VECTLENSP 32-bit lanes by c bits.
 *
 * The shift is performed on the unsigned view of the lane (the .u member of
 * the union).  Left-shifting a negative int32_t, or shifting a set bit into
 * or past the sign bit, is undefined behaviour in C; the unsigned shift just
 * discards the bits that fall off the top.  c must be in the range [0, 31].
 */
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {
  vint2 ret;
  for(int i=0;i<VECTLENSP;i++) {
    ret.u[i] = x.u[i] << c;
  }
  return ret;
}
/* Logical right shift: each lane is shifted as an unsigned 32-bit value. */
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {
  vint2 shifted;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    const uint32_t bits = (uint32_t)x.i[lane];
    shifted.i[lane] = bits >> c;
  }
  return shifted;
}

/* Right shift applied directly to the signed lane value. */
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) {
  vint2 shifted;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    shifted.i[lane] = x.i[lane] >> c;
  }
  return shifted;
}
/* All-ones 32-bit lane where d equals +SLEEF_INFINITYf or -SLEEF_INFINITYf. */
static INLINE vopmask visinf_vo_vf(vfloat d) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    if (d.f[lane] == SLEEF_INFINITYf || d.f[lane] == -SLEEF_INFINITYf) {
      hit.u[lane] = UINT32_MAX;
    } else {
      hit.u[lane] = 0;
    }
  }
  return hit;
}

/* All-ones 32-bit lane where d equals +SLEEF_INFINITYf. */
static INLINE vopmask vispinf_vo_vf(vfloat d) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    hit.u[lane] = (d.f[lane] == SLEEF_INFINITYf) ? UINT32_MAX : 0;
  }
  return hit;
}

/* All-ones 32-bit lane where d equals -SLEEF_INFINITYf. */
static INLINE vopmask visminf_vo_vf(vfloat d) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    hit.u[lane] = (d.f[lane] == -SLEEF_INFINITYf) ? UINT32_MAX : 0;
  }
  return hit;
}

/* All-ones 32-bit lane where d compares unequal to itself (the NaN test). */
static INLINE vopmask visnan_vo_vf(vfloat d) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    hit.u[lane] = (d.f[lane] != d.f[lane]) ? UINT32_MAX : 0;
  }
  return hit;
}
/* Integer equality over all VECTLENSP lanes, returned as an operation mask. */
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    hit.u[lane] = (x.i[lane] == y.i[lane]) ? UINT32_MAX : 0;
  }
  return hit;
}

/* Signed integer greater-than over all VECTLENSP lanes, returned as an operation mask. */
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {
  vopmask hit;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    hit.u[lane] = (x.i[lane] > y.i[lane]) ? UINT32_MAX : 0;
  }
  return hit;
}

/* Integer equality returned as a vint2: -1 where x == y, 0 elsewhere. */
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
  vint2 hit;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    if (x.i[lane] == y.i[lane]) {
      hit.i[lane] = -1;
    } else {
      hit.i[lane] = 0;
    }
  }
  return hit;
}

/* Signed greater-than returned as a vint2: -1 where x > y, 0 elsewhere. */
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
  vint2 hit;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    if (x.i[lane] > y.i[lane]) {
      hit.i[lane] = -1;
    } else {
      hit.i[lane] = 0;
    }
  }
  return hit;
}
/* Lane-wise square root using sqrtf() from <math.h>. */
static INLINE vfloat vsqrt_vf_vf(vfloat x) {
  vfloat roots;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    roots.f[lane] = sqrtf(x.f[lane]);
  }
  return roots;
}
#ifdef _MSC_VER
/* Extracts lane 0 of a vfloat; only compiled when _MSC_VER is defined. */
static INLINE float vcast_f_vf(vfloat v) {
  const float first_lane = v.f[0];
  return first_lane;
}
#endif
/* Loads a whole vfloat by reading *ptr through a vfloat pointer.
 * NOTE(review): presumably ptr must be suitably aligned for vfloat — confirm. */
static INLINE vfloat vload_vf_p(const float *ptr) {
  vfloat *src = (vfloat *)ptr;
  return *src;
}

/* Loads VECTLENSP floats element by element. */
static INLINE vfloat vloadu_vf_p(const float *ptr) {
  vfloat loaded;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    loaded.f[lane] = ptr[lane];
  }
  return loaded;
}

/* Gathers ptr[vi2.i[k]] into lane k for each of the VECTLENSP lanes. */
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  vfloat gathered;
  for (int lane = 0; lane < VECTLENSP; lane++) {
    const int index = vi2.i[lane];
    gathered.f[lane] = ptr[index];
  }
  return gathered;
}

/* Stores a whole vfloat by writing *ptr through a vfloat pointer. */
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) {
  vfloat *dst = (vfloat *)ptr;
  *dst = v;
}

/* Stores VECTLENSP floats element by element. */
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
  for (int lane = 0; lane < VECTLENSP; lane++) {
    ptr[lane] = v.f[lane];
  }
}

/* "Streaming" store; here it is an ordinary whole-vector store. */
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) {
  vfloat *dst = (vfloat *)ptr;
  *dst = v;
}
/* Writes pair k of v (lanes 2k, 2k+1) to ptr[(offset + step*k)*2] and the
 * element after it, for each of the VECTLENSP/2 pairs. */
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  const int npairs = VECTLENSP / 2;
  for (int pair = 0; pair < npairs; pair++) {
    float *dst = ptr + (offset + step * pair) * 2;
    dst[0] = v.f[pair * 2 + 0];
    dst[1] = v.f[pair * 2 + 1];
  }
}

/* Same operation as vscatter2_v_p_i_i_vf; simply forwards to it. */
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  vscatter2_v_p_i_i_vf(ptr, offset, step, v);
}
/* Broadcasts d into every long double lane. */
static INLINE vlongdouble vcast_vl_l(long double d) {
  vlongdouble splat;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    splat.ld[lane] = d;
  }
  return splat;
}

/* Swaps the two values inside each adjacent pair (lanes 2k and 2k+1). */
static INLINE vlongdouble vrev21_vl_vl(vlongdouble d0) {
  const int npairs = VECTLENDP / 2;
  vlongdouble swapped;
  for (int pair = 0; pair < npairs; pair++) {
    const int lo = pair * 2;
    const int hi = lo + 1;
    swapped.ld[lo] = d0.ld[hi];
    swapped.ld[hi] = d0.ld[lo];
  }
  return swapped;
}

/* Reverses the order of the adjacent pairs while keeping the order inside each pair. */
static INLINE vlongdouble vreva2_vl_vl(vlongdouble d0) {
  const int npairs = VECTLENDP / 2;
  vlongdouble reversed;
  for (int pair = 0; pair < npairs; pair++) {
    const int dst = pair * 2;
    const int src = (npairs - 1 - pair) * 2;
    reversed.ld[dst] = d0.ld[src];
    reversed.ld[dst + 1] = d0.ld[src + 1];
  }
  return reversed;
}
/* Lane-wise x + y. */
static INLINE vlongdouble vadd_vl_vl_vl(vlongdouble x, vlongdouble y) {
  vlongdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.ld[lane] = x.ld[lane] + y.ld[lane];
  }
  return out;
}

/* Lane-wise x - y. */
static INLINE vlongdouble vsub_vl_vl_vl(vlongdouble x, vlongdouble y) {
  vlongdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.ld[lane] = x.ld[lane] - y.ld[lane];
  }
  return out;
}

/* Lane-wise x * y. */
static INLINE vlongdouble vmul_vl_vl_vl(vlongdouble x, vlongdouble y) {
  vlongdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.ld[lane] = x.ld[lane] * y.ld[lane];
  }
  return out;
}

/* Lane-wise negation. */
static INLINE vlongdouble vneg_vl_vl(vlongdouble x) {
  vlongdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.ld[lane] = -x.ld[lane];
  }
  return out;
}

/* Even lanes get x - y, odd lanes get x + y. */
static INLINE vlongdouble vsubadd_vl_vl_vl(vlongdouble x, vlongdouble y) {
  vlongdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    if ((lane & 1) == 0) {
      out.ld[lane] = x.ld[lane] - y.ld[lane];
    } else {
      out.ld[lane] = x.ld[lane] + y.ld[lane];
    }
  }
  return out;
}

/* Computes x * y first, then applies the sub/add pattern with z. */
static INLINE vlongdouble vmlsubadd_vl_vl_vl_vl(vlongdouble x, vlongdouble y, vlongdouble z) {
  const vlongdouble product = vmul_vl_vl_vl(x, y);
  return vsubadd_vl_vl_vl(product, z);
}

/* Keeps even lanes and negates odd lanes. */
static INLINE vlongdouble vposneg_vl_vl(vlongdouble x) {
  vlongdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    if ((lane & 1) == 0) {
      out.ld[lane] = x.ld[lane];
    } else {
      out.ld[lane] = -x.ld[lane];
    }
  }
  return out;
}

/* Negates even lanes and keeps odd lanes. */
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble x) {
  vlongdouble out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    if ((lane & 1) == 0) {
      out.ld[lane] = -x.ld[lane];
    } else {
      out.ld[lane] = x.ld[lane];
    }
  }
  return out;
}
/* Loads a whole vlongdouble by reading *ptr through a vlongdouble pointer. */
static INLINE vlongdouble vload_vl_p(const long double *ptr) {
  vlongdouble *src = (vlongdouble *)ptr;
  return *src;
}

/* Loads VECTLENDP long doubles element by element. */
static INLINE vlongdouble vloadu_vl_p(const long double *ptr) {
  vlongdouble loaded;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    loaded.ld[lane] = ptr[lane];
  }
  return loaded;
}

/* Stores a whole vlongdouble by writing *ptr through a vlongdouble pointer. */
static INLINE void vstore_v_p_vl(long double *ptr, vlongdouble v) {
  vlongdouble *dst = (vlongdouble *)ptr;
  *dst = v;
}

/* Stores VECTLENDP long doubles element by element. */
static INLINE void vstoreu_v_p_vl(long double *ptr, vlongdouble v) {
  for (int lane = 0; lane < VECTLENDP; lane++) {
    ptr[lane] = v.ld[lane];
  }
}

/* "Streaming" store; here it is an ordinary whole-vector store. */
static INLINE void vstream_v_p_vl(long double *ptr, vlongdouble v) {
  vlongdouble *dst = (vlongdouble *)ptr;
  *dst = v;
}

/* Writes pair k of v (lanes 2k, 2k+1) to ptr[(offset + step*k)*2] and the
 * element after it, for each of the VECTLENDP/2 pairs. */
static INLINE void vscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) {
  const int npairs = VECTLENDP / 2;
  for (int pair = 0; pair < npairs; pair++) {
    long double *dst = ptr + (offset + step * pair) * 2;
    dst[0] = v.ld[pair * 2 + 0];
    dst[1] = v.ld[pair * 2 + 1];
  }
}

/* Same operation as vscatter2_v_p_i_i_vl; simply forwards to it. */
static INLINE void vsscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) {
  vscatter2_v_p_i_i_vl(ptr, offset, step, v);
}
#ifdef Sleef_quad2_DEFINED
/* Broadcasts d into every quad lane. */
static INLINE vquad vcast_vq_q(Sleef_quad d) {
  vquad splat;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    splat.q[lane] = d;
  }
  return splat;
}

/* Swaps the two values inside each adjacent pair (lanes 2k and 2k+1). */
static INLINE vquad vrev21_vq_vq(vquad d0) {
  const int npairs = VECTLENDP / 2;
  vquad swapped;
  for (int pair = 0; pair < npairs; pair++) {
    const int lo = pair * 2;
    const int hi = lo + 1;
    swapped.q[lo] = d0.q[hi];
    swapped.q[hi] = d0.q[lo];
  }
  return swapped;
}

/* Reverses the order of the adjacent pairs while keeping the order inside each pair. */
static INLINE vquad vreva2_vq_vq(vquad d0) {
  const int npairs = VECTLENDP / 2;
  vquad reversed;
  for (int pair = 0; pair < npairs; pair++) {
    const int dst = pair * 2;
    const int src = (npairs - 1 - pair) * 2;
    reversed.q[dst] = d0.q[src];
    reversed.q[dst + 1] = d0.q[src + 1];
  }
  return reversed;
}
/* Lane-wise x + y. */
static INLINE vquad vadd_vq_vq_vq(vquad x, vquad y) {
  vquad out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.q[lane] = x.q[lane] + y.q[lane];
  }
  return out;
}

/* Lane-wise x - y. */
static INLINE vquad vsub_vq_vq_vq(vquad x, vquad y) {
  vquad out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.q[lane] = x.q[lane] - y.q[lane];
  }
  return out;
}

/* Lane-wise x * y. */
static INLINE vquad vmul_vq_vq_vq(vquad x, vquad y) {
  vquad out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.q[lane] = x.q[lane] * y.q[lane];
  }
  return out;
}

/* Lane-wise negation. */
static INLINE vquad vneg_vq_vq(vquad x) {
  vquad out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    out.q[lane] = -x.q[lane];
  }
  return out;
}

/* Even lanes get x - y, odd lanes get x + y. */
static INLINE vquad vsubadd_vq_vq_vq(vquad x, vquad y) {
  vquad out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    if ((lane & 1) == 0) {
      out.q[lane] = x.q[lane] - y.q[lane];
    } else {
      out.q[lane] = x.q[lane] + y.q[lane];
    }
  }
  return out;
}

/* Computes x * y first, then applies the sub/add pattern with z. */
static INLINE vquad vmlsubadd_vq_vq_vq_vq(vquad x, vquad y, vquad z) {
  const vquad product = vmul_vq_vq_vq(x, y);
  return vsubadd_vq_vq_vq(product, z);
}

/* Keeps even lanes and negates odd lanes. */
static INLINE vquad vposneg_vq_vq(vquad x) {
  vquad out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    if ((lane & 1) == 0) {
      out.q[lane] = x.q[lane];
    } else {
      out.q[lane] = -x.q[lane];
    }
  }
  return out;
}

/* Negates even lanes and keeps odd lanes. */
static INLINE vquad vnegpos_vq_vq(vquad x) {
  vquad out;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    if ((lane & 1) == 0) {
      out.q[lane] = -x.q[lane];
    } else {
      out.q[lane] = x.q[lane];
    }
  }
  return out;
}
/* Loads a whole vquad by reading *ptr through a vquad pointer. */
static INLINE vquad vload_vq_p(const Sleef_quad *ptr) {
  vquad *src = (vquad *)ptr;
  return *src;
}

/* Loads VECTLENDP quads element by element. */
static INLINE vquad vloadu_vq_p(const Sleef_quad *ptr) {
  vquad loaded;
  for (int lane = 0; lane < VECTLENDP; lane++) {
    loaded.q[lane] = ptr[lane];
  }
  return loaded;
}

/* Stores a whole vquad by writing *ptr through a vquad pointer. */
static INLINE void vstore_v_p_vq(Sleef_quad *ptr, vquad v) {
  vquad *dst = (vquad *)ptr;
  *dst = v;
}

/* Stores VECTLENDP quads element by element. */
static INLINE void vstoreu_v_p_vq(Sleef_quad *ptr, vquad v) {
  for (int lane = 0; lane < VECTLENDP; lane++) {
    ptr[lane] = v.q[lane];
  }
}

/* "Streaming" store; here it is an ordinary whole-vector store. */
static INLINE void vstream_v_p_vq(Sleef_quad *ptr, vquad v) {
  vquad *dst = (vquad *)ptr;
  *dst = v;
}

/* Writes pair k of v (lanes 2k, 2k+1) to ptr[(offset + step*k)*2] and the
 * element after it, for each of the VECTLENDP/2 pairs. */
static INLINE void vscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) {
  const int npairs = VECTLENDP / 2;
  for (int pair = 0; pair < npairs; pair++) {
    Sleef_quad *dst = ptr + (offset + step * pair) * 2;
    dst[0] = v.q[pair * 2 + 0];
    dst[1] = v.q[pair * 2 + 1];
  }
}

/* Same operation as vscatter2_v_p_i_i_vq; simply forwards to it. */
static INLINE void vsscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) {
  vscatter2_v_p_i_i_vq(ptr, offset, step, v);
}
#endif