#ifndef _CLAMP_H
#define _CLAMP_H
#ifndef INLINE
#define INLINE
#endif
#include "vu.h"
static INLINE void merge(short* VD, short* cmp, short* pass, short* fail)
{
register int i;
#ifdef ARCH_MIN_ARM_NEON
int16x8_t p,f,d,c,vd,temp;
p = vld1q_s16((const int16_t*)pass);
f = vld1q_s16((const int16_t*)fail);
c = vld1q_s16((const int16_t*)cmp);
d = vsubq_s16(p,f);
vd = vmlaq_s16(f, c, d); vst1q_s16(VD, vd);
return;
#else
#if (0)
for (i = 0; i < N; i++)
VD[i] = (cmp[i] != 0) ? pass[i] : fail[i];
#else
ALIGNED short diff[N];
for (i = 0; i < N; i++)
diff[i] = pass[i] - fail[i];
for (i = 0; i < N; i++)
VD[i] = fail[i] + cmp[i]*diff[i];
#endif
return;
#endif
}
#ifdef ARCH_MIN_ARM_NEON
/*
 * NEON variant: copies one 8-lane vector of 16-bit values from VS to VD
 * with a single load/store pair.
 */
INLINE void vector_copy(short * VD, short * VS)
{
    vst1q_s16(VD, vld1q_s16((const int16_t*)VS));
}
/*
 * NEON variant: stores into VD the lane-wise sum of VS, VT and state->co,
 * built from two signed saturating additions.
 *
 * state - supplies the carry-in lanes (state->co)
 * VD    - destination vector
 * VS/VT - the two addends
 */
static INLINE void SIGNED_CLAMP_ADD(usf_state_t * state, short* VD, short* VS, short* VT)
{
    const int16x8_t vs = vld1q_s16((const int16_t*)VS);
    const int16x8_t vt = vld1q_s16((const int16_t*)VT);
    const int16x8_t carry = vld1q_s16((const int16_t*)state->co);
    const int16x8_t larger = vmaxq_s16(vt, vs);
    const int16x8_t smaller = vminq_s16(vt, vs);
    /*
     * The carry is added to the smaller operand first.
     * NOTE(review): presumably so this step cannot saturate unless the
     * final sum saturates as well -- assumes co lanes are 0 or 1.
     */
    const int16x8_t partial = vqaddq_s16(smaller, carry);

    vst1q_s16(VD, vqaddq_s16(larger, partial));
}
/*
 * NEON variant: stores into VD a signed-saturated lane-wise result derived
 * from VS - VT and the carry lanes in state->co.
 *
 * state - supplies the carry/borrow lanes (state->co)
 * VD    - destination vector (write-only here)
 * VS    - minuend
 * VT    - subtrahend
 */
static INLINE void SIGNED_CLAMP_SUB(usf_state_t * state, short* VD, short* VS, short* VT)
{
    int16x8_t dst, src, vco, dif, res, xmm;

    src = vld1q_s16((const int16_t*)VS);
    dst = vld1q_s16((const int16_t*)VT);
    vco = vld1q_s16((const int16_t*)state->co);

    /* Saturating difference without the borrow. */
    res = vqsubq_s16(src, dst);

    /*
     * Build a mask whose sign bit is set only where: adding the carry flips
     * the sign of the saturated difference, VT is negative, VS is
     * non-negative and the wrapping difference is negative.
     */
    dif = vaddq_s16(res, vco);
    dif = veorq_s16(dif, res);
    dif = vandq_s16(dif, dst);
    xmm = vsubq_s16(src, dst);
    src = vbicq_s16(dif, src); /* dif & ~src */
    xmm = vandq_s16(xmm, src);

    /* Arithmetic shift spreads the sign bit: each lane becomes 0 or all-ones. */
    xmm = vshrq_n_s16(xmm, 15);

    /* Keep the carry only in lanes where the mask is clear (vco & ~xmm). */
    xmm = vbicq_s16(vco, xmm);

    /* Apply the remaining borrow with saturation. */
    res = vqsubq_s16(res, xmm);
    vst1q_s16(VD, res);
    return;
}
/*
 * NEON variant: interleaves VACC_M (low halves) with VACC_H (high halves)
 * into 32-bit lanes and narrows them back to 16 bits with signed
 * saturation, storing the 8 results in VD.
 *
 * state - unused directly here; NOTE(review): the VACC_* names are not
 *         defined in this file and presumably expand to members of state.
 * VD    - destination vector
 */
static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD)
{
    int16x8_t pvs, pvd;
    int16x8x2_t packed;
    int16x8_t result;
    int16x4_t low, high;

    pvs = vld1q_s16((const int16_t*)VACC_H);
    pvd = vld1q_s16((const int16_t*)VACC_M);

    /* val[0] holds lanes 0-3 as (M, H) pairs, val[1] holds lanes 4-7. */
    packed = vzipq_s16(pvd, pvs);

    /*
     * Reinterpret each (M, H) pair as one 32-bit lane. The intrinsic is used
     * instead of a C cast, which relies on lax vector conversions.
     */
    low = vqmovn_s32(vreinterpretq_s32_s16(packed.val[0]));
    high = vqmovn_s32(vreinterpretq_s32_s16(packed.val[1]));
    result = vcombine_s16(low, high);
    vst1q_s16(VD, result);
    return;
}
#endif
#if !defined ARCH_MIN_SSE2 && !defined ARCH_MIN_ARM_NEON
/*
 * Portable variant: copies N 16-bit lanes from VS to VD, one element at a
 * time in ascending index order.
 */
INLINE void vector_copy(short* VD, short* VS)
{
    int lane = 0;

    while (lane < N)
    {
        VD[lane] = VS[lane];
        lane++;
    }
}
/*
 * Portable variant: clamps VS + VT + state->co to the signed 16-bit range
 * and stores the result in VD.
 *
 * The unclamped lanes are taken from VACC_L, not recomputed here.
 * NOTE(review): this presumes the caller has already stored the wrapped
 * 16-bit sum in VACC_L -- confirm against callers.
 *
 * The ">> 31" tests rely on right shifts of negative values being
 * arithmetic (implementation-defined in C), producing -1 or 0.
 */
static INLINE void SIGNED_CLAMP_ADD(usf_state_t * state, short* VD, short* VS, short* VT)
{
    ALIGNED int32_t total[N];
    ALIGNED short over[N], under[N];
    int lane;

    /* Full-width sums are computed before VD is touched. */
    for (lane = 0; lane < N; lane++)
        total[lane] = VS[lane] + VT[lane] + state->co[lane];

    for (lane = 0; lane < N; lane++)
    {
        under[lane] = (total[lane] + 0x8000) >> 31; /* -1 when total < -32768 */
        over[lane] = (0x7FFF - total[lane]) >> 31;  /* -1 when total > +32767 */
    }

    vector_copy(VD, VACC_L);

    for (lane = 0; lane < N; lane++)
    {
        VD[lane] &= ~under[lane];                         /* underflow: clear lane */
        VD[lane] |= over[lane];                           /* overflow: set all bits */
        VD[lane] ^= 0x8000 & (over[lane] | under[lane]);  /* -> 0x8000 or 0x7FFF */
    }
}
/*
 * Portable variant: clamps VS - VT - state->co to the signed 16-bit range
 * and stores the result in VD.
 *
 * The unclamped lanes are taken from VACC_L, not recomputed here.
 * NOTE(review): this presumes the caller has already stored the wrapped
 * 16-bit difference in VACC_L -- confirm against callers.
 *
 * The ">> 31" tests rely on right shifts of negative values being
 * arithmetic (implementation-defined in C), producing -1 or 0.
 */
static INLINE void SIGNED_CLAMP_SUB(usf_state_t * state, short* VD, short* VS, short* VT)
{
    ALIGNED int32_t delta[N];
    ALIGNED short over[N], under[N];
    int lane;

    /* Full-width differences are computed before VD is touched. */
    for (lane = 0; lane < N; lane++)
        delta[lane] = VS[lane] - VT[lane] - state->co[lane];

    for (lane = 0; lane < N; lane++)
    {
        under[lane] = (delta[lane] + 0x8000) >> 31; /* -1 when delta < -32768 */
        over[lane] = (0x7FFF - delta[lane]) >> 31;  /* -1 when delta > +32767 */
    }

    vector_copy(VD, VACC_L);

    for (lane = 0; lane < N; lane++)
    {
        VD[lane] &= ~under[lane];                         /* underflow: clear lane */
        VD[lane] |= over[lane];                           /* overflow: set all bits */
        VD[lane] ^= 0x8000 & (over[lane] | under[lane]);  /* -> 0x8000 or 0x7FFF */
    }
}
/*
 * Portable variant: treats each (VACC_H[i], VACC_M[i]) pair as the high and
 * low halves of one signed value and stores it in VD clamped to 16 bits.
 *
 * A lane underflows when VACC_H < -1, or when VACC_H is negative while
 * VACC_M is non-negative. A lane overflows when VACC_H > 0, or when
 * VACC_H == 0 while VACC_M is negative. Otherwise VACC_M is passed through.
 *
 * state - unused directly here; NOTE(review): the VACC_* names are not
 *         defined in this file and presumably expand to members of state.
 * VD    - destination vector
 */
static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD)
{
    ALIGNED short over[N], under[N];
    int lane;

    /* Both flag sets (each 0 or 1) are derived before VD is written. */
    for (lane = 0; lane < N; lane++)
    {
        under[lane] = (VACC_H[lane] < ~0);
        under[lane] |= (VACC_H[lane] < 0) & !(VACC_M[lane] < 0);
        over[lane] = (VACC_H[lane] > 0);
        over[lane] |= (VACC_H[lane] == 0) & (VACC_M[lane] < 0);
    }

    vector_copy(VD, VACC_M);

    for (lane = 0; lane < N; lane++)
    {
        VD[lane] &= -(under[lane] ^ 1);                   /* underflow: clear lane */
        VD[lane] |= -over[lane];                          /* overflow: set all bits */
        VD[lane] ^= 0x8000 * (over[lane] | under[lane]);  /* -> 0x8000 or 0x7FFF */
    }
}
#endif
#ifdef ARCH_MIN_SSE2
/*
 * SSE2 variant: copies one 128-bit vector from VS to VD. Both pointers go
 * through the aligned load/store intrinsics, which require 16-byte
 * alignment.
 */
INLINE void vector_copy(short* VD, short* VS)
{
    _mm_store_si128((__m128i *)VD, _mm_load_si128((__m128i *)VS));
}
/*
 * SSE2 variant: stores into VD the lane-wise sum of VS, VT and state->co,
 * built from two signed saturating additions.
 *
 * state - supplies the carry-in lanes (state->co)
 * VD    - destination vector
 * VS/VT - the two addends
 */
static INLINE void SIGNED_CLAMP_ADD(usf_state_t * state, short* VD, short* VS, short* VT)
{
    const __m128i vs = _mm_load_si128((__m128i *)VS);
    const __m128i vt = _mm_load_si128((__m128i *)VT);
    const __m128i carry = _mm_load_si128((__m128i *)state->co);
    const __m128i larger = _mm_max_epi16(vt, vs);
    const __m128i smaller = _mm_min_epi16(vt, vs);
    /*
     * The carry is added to the smaller operand first.
     * NOTE(review): presumably so this step cannot saturate unless the
     * final sum saturates as well -- assumes co lanes are 0 or 1.
     */
    const __m128i partial = _mm_adds_epi16(smaller, carry);

    _mm_store_si128((__m128i *)VD, _mm_adds_epi16(larger, partial));
}
/*
 * SSE2 variant: stores into VD a signed-saturated lane-wise result derived
 * from VS - VT and the carry lanes in state->co.
 *
 * state - supplies the carry/borrow lanes (state->co)
 * VD    - destination vector
 * VS    - minuend
 * VT    - subtrahend
 */
static INLINE void SIGNED_CLAMP_SUB(usf_state_t * state, short* VD, short* VS, short* VT)
{
    const __m128i vs = _mm_load_si128((__m128i *)VS);
    const __m128i vt = _mm_load_si128((__m128i *)VT);
    const __m128i carry = _mm_load_si128((__m128i *)state->co);

    /* Saturating difference without the borrow. */
    const __m128i clamped = _mm_subs_epi16(vs, vt);

    /* Bits that change when the carry is added to the saturated difference. */
    const __m128i bumped = _mm_add_epi16(clamped, carry);
    const __m128i flipped = _mm_xor_si128(bumped, clamped);

    /* Restrict to lanes where VT is negative ... */
    const __m128i vt_neg = _mm_and_si128(flipped, vt);
    const __m128i wrapped = _mm_sub_epi16(vs, vt);
    /* ... and VS is non-negative (~vs & vt_neg) ... */
    const __m128i vs_pos = _mm_andnot_si128(vs, vt_neg);
    /* ... and the wrapping difference is negative. */
    const __m128i marked = _mm_and_si128(wrapped, vs_pos);

    /* Logical shift leaves the sign bit as 0 or 1 in each lane. */
    const __m128i skip = _mm_srli_epi16(marked, 15);

    /* Borrow to apply: ~skip & carry. */
    const __m128i borrow = _mm_andnot_si128(skip, carry);

    _mm_store_si128((__m128i *)VD, _mm_subs_epi16(clamped, borrow));
}
/*
 * SSE2 variant: interleaves VACC_M (low halves) with VACC_H (high halves)
 * into 32-bit lanes and packs them back to 16 bits with signed saturation,
 * storing the 8 results in VD.
 *
 * state - unused directly here; NOTE(review): the VACC_* names are not
 *         defined in this file and presumably expand to members of state.
 * VD    - destination vector
 */
static INLINE void SIGNED_CLAMP_AM(usf_state_t * state, short* VD)
{
    const __m128i acc_hi = _mm_load_si128((__m128i *)VACC_H);
    const __m128i acc_md = _mm_load_si128((__m128i *)VACC_M);
    /* Lanes 0-3 and 4-7 widened to 32 bits as (M, H) pairs. */
    const __m128i front = _mm_unpacklo_epi16(acc_md, acc_hi);
    const __m128i back = _mm_unpackhi_epi16(acc_md, acc_hi);

    _mm_store_si128((__m128i *)VD, _mm_packs_epi32(front, back));
}
#endif
/*
 * Stores into VD a clamped form of the accumulator derived from the signed
 * clamp produced by SIGNED_CLAMP_AM:
 *   - lanes whose signed clamp is negative become 0;
 *   - lanes whose signed clamp is greater than VACC_M become all-ones;
 *   - all other lanes keep the signed clamp value.
 *
 * state - forwarded to SIGNED_CLAMP_AM
 * VD    - destination vector
 */
static INLINE void UNSIGNED_CLAMP(usf_state_t * state, short* VD)
{
    ALIGNED short temp[N];
#ifdef ARCH_MIN_ARM_NEON
    int16x8_t t, vaccm, sign, vd;
    uint16x8_t c;

    /*
     * temp must be filled by SIGNED_CLAMP_AM before it is loaded; loading
     * it first would compute the result from uninitialized stack data.
     */
    SIGNED_CLAMP_AM(state, temp);
    t = vld1q_s16((const int16_t*)temp);
    vaccm = vld1q_s16((const int16_t*)VACC_M);

    c = vcgtq_s16(t, vaccm);      /* all-ones where t > VACC_M */
    sign = vshrq_n_s16(t, 15);    /* all-ones where t is negative */
    vd = vbicq_s16(t, sign);      /* t & ~sign: negative lanes become 0 */
    vd = vorrq_s16(vd, vreinterpretq_s16_u16(c));
    vst1q_s16(VD, vd);
    return;
#else
    ALIGNED short cond[N];
    register int i;

    SIGNED_CLAMP_AM(state, temp);
    /* cond is -1 (all bits set) where the signed clamp exceeds VACC_M. */
    for (i = 0; i < N; i++)
        cond[i] = -(temp[i] > VACC_M[i]);
    /* Negative lanes are zeroed via the sign-extended mask. */
    for (i = 0; i < N; i++)
        VD[i] = temp[i] & ~(temp[i] >> 15);
    for (i = 0; i < N; i++)
        VD[i] = VD[i] | cond[i];
    return;
#endif
}
/*
 * Stores into VD, per lane:
 *   - VACC_L where the signed clamp from SIGNED_CLAMP_AM equals VACC_M;
 *   - otherwise the signed clamp value with bit 15 toggled.
 * The selection is performed by merge().
 *
 * state - forwarded to SIGNED_CLAMP_AM
 * VD    - destination vector
 */
static INLINE void SIGNED_CLAMP_AL(usf_state_t * state, short* VD)
{
    ALIGNED short cond[N];
    ALIGNED short temp[N];
#ifdef ARCH_MIN_ARM_NEON
    int16x8_t eightk, t, vaccm;
    uint16x8_t c, one;

    SIGNED_CLAMP_AM(state, temp);
    eightk = vdupq_n_s16((int16_t)0x8000);
    one = vdupq_n_u16(1);
    t = vld1q_s16((const int16_t*)temp);
    vaccm = vld1q_s16((const int16_t*)VACC_M);

    c = vceqq_s16(t, vaccm);  /* all-ones where equal, 0 where different */
    c = vaddq_u16(c, one);    /* wraps to 0 where equal, 1 where different */
    t = veorq_s16(t, eightk); /* toggle bit 15 */

    /*
     * cond is a short array: store through the signed intrinsic so no
     * incompatible uint16_t* pointer is passed.
     */
    vst1q_s16(cond, vreinterpretq_s16_u16(c));
    vst1q_s16(temp, t);
    merge(VD, cond, temp, VACC_L);
    return;
#else
    register int i;

    SIGNED_CLAMP_AM(state, temp);
    /* cond is 1 where the signed clamp differs from VACC_M. */
    for (i = 0; i < N; i++)
        cond[i] = (temp[i] != VACC_M[i]);
    for (i = 0; i < N; i++)
        temp[i] ^= 0x8000; /* toggle bit 15 */
    merge(VD, cond, temp, VACC_L);
    return;
#endif
}
#endif