#ifndef HVX_INVERSE_H
#define HVX_INVERSE_H
#include <HAP_farf.h>
#include <math.h>
#include <string.h>
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include "hvx-base.h"
// Fixed-point, per-16-bit-lane polynomial approximation of 1/(1+x) for an
// unsigned mantissa fraction x ("O3" = degree-3 scheme). Each 64-bit
// immediate packs four 16-bit coefficients: vlut4 picks the per-lane
// starting coefficient, then the saturating vmpa/vmps steps fold in the
// remaining terms Horner-style.
// NOTE(review): coefficient constants are assumed to come from offline
// minimax fitting upstream — not verifiable from this file alone.
static inline HVX_Vector hvx_vec_recip_xp1_O3_unsigned(HVX_Vector vx) {
HVX_Vector p;
p = Q6_Vh_vlut4_VuhPh(vx, 0xFAE6F6D4EE73D6A3ull); /* per-lane leading coefficient */
p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x2E49406159097A14ull); /* p = p*x + c2 (sat) */
p = Q6_Vh_vmps_VhVhVuhPuh_sat(p, vx, 0x5DF66B7177AB7FC2ull); /* p = p*x - c1 (sat) */
p = Q6_Vh_vmpa_VhVhVuhPuh_sat(p, vx, 0x79E57D427F4E8001ull); /* p = p*x + c0 (sat) */
return p; }
// Approximate per-lane reciprocal (1/x) of a vector of IEEE fp16 values.
//
// Strategy: strip the sign, widen to fp32 (via a qf32 multiply by fp16 1.0)
// so the 8-bit exponent and mantissa bits are directly addressable, run a
// fixed-point polynomial to approximate 1/(1+mant), then rebuild the fp16
// result by renormalizing the mantissa and negating/rebiasing the exponent.
// The original sign is OR-ed back in at the end.
static inline HVX_Vector hvx_vec_inverse_f16(HVX_Vector vals) {
HVX_Vector em_mask = Q6_Vh_vsplat_R(0x7FFF);
// avals = |vals|: clear the sign bit of every 16-bit lane.
HVX_Vector avals = Q6_V_vand_VV(vals, em_mask);
// Signed halfword compare: |x| > x exactly when the sign bit was set.
HVX_VectorPred is_neg = Q6_Q_vcmp_gt_VhVh(avals, vals);
// Lanes with raw bits below 0x101 (zero / denormal / smallest normals):
// their reciprocal overflows fp16, so they are clamped later.
HVX_VectorPred is_small = Q6_Q_vcmp_gt_VhVh(Q6_Vh_vsplat_R(0x101), avals);
// Multiply by fp16 1.0 (0x3C00) to widen fp16 -> qf32 -> fp32 pair.
HVX_VectorPair to_qf32 = Q6_Wqf32_vmpy_VhfVhf(avals, Q6_Vh_vsplat_R(0x3C00)); HVX_Vector to_f32_0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(to_qf32));
HVX_Vector to_f32_1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(to_qf32));
// Shift fp32 left by 9 (drops sign + 8 exponent bits), then keep the odd
// (upper) halfword of each word: the top 16 mantissa bits per lane.
HVX_Vector mant_u16 = Q6_Vh_vshuffo_VhVh(Q6_Vw_vasl_VwR(to_f32_1, 9), Q6_Vw_vasl_VwR(to_f32_0, 9));
// Upper halfword holds sign|exp|mant[22:16]; with sign cleared, a logical
// shift right by 7 leaves the 8-bit biased fp32 exponent.
HVX_Vector exp_u16 = Q6_Vh_vshuffo_VhVh(to_f32_1, to_f32_0);
exp_u16 = Q6_Vuh_vlsr_VuhR(exp_u16, 7);
// Polynomial approximation of 1/(1+m) on the mantissa fraction.
HVX_Vector rm = hvx_vec_recip_xp1_O3_unsigned(mant_u16);
// Renormalize the raw reciprocal: left-justify by the leading-zero count,
// then keep the top 10 mantissa bits for the fp16 encoding.
HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm);
HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF));
// Reciprocal exponent: negate about the fp32 bias (254 - e), then account
// for the normalization shift (vcl0 - 1).
HVX_Vector exp_recip =
Q6_Vh_vsub_VhVh(Q6_Vh_vsub_VhVh(Q6_Vh_vsplat_R(254), exp_u16), Q6_Vh_vsub_VhVh(vcl0, Q6_Vh_vsplat_R(1)));
// Rebias from fp32 (127) to fp16 (15) and move into exponent bits [14:10].
exp_recip = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vsub_VhVh(exp_recip, Q6_Vh_vsplat_R(127)), Q6_Vh_vsplat_R(15));
exp_recip = Q6_Vh_vasl_VhR(exp_recip, 10);
HVX_Vector recip = Q6_V_vor_VV(exp_recip, mant_recip);
// Tiny inputs: saturate to the largest finite fp16 (0x7bff) instead of Inf.
recip = Q6_V_vmux_QVV(is_small, Q6_Vh_vsplat_R(0x7bff), recip);
// Restore the sign bit on lanes whose input was negative.
recip = Q6_V_vandor_VQR(recip, is_neg, 0x80008000);
return recip;
}
// Approximate per-lane reciprocal (1/x) of a vector of fp32 values:
// integer "magic constant" initial guess refined by three Newton-Raphson
// iterations r' = r * (2 - r*x), accumulated in qf32 between steps.
// No special-case handling here — Inf/NaN lanes are cleaned up by
// hvx_vec_inverse_f32_guard().
static inline HVX_Vector hvx_vec_inverse_f32(HVX_Vector v_sf) {
// Seed: integer subtraction from 0x7EEEEBB3 gives a coarse 1/x estimate
// (same bit-trick family as the fast inverse square root constant).
HVX_Vector inv_aprox_sf = Q6_V_vsplat_R(0x7EEEEBB3);
HVX_Vector two_sf = hvx_vec_splat_f32(2.0);
HVX_Vector i_sf = Q6_Vw_vsub_VwVw(inv_aprox_sf, v_sf);
HVX_Vector r_qf;
// NR step 1: r = i * (2 - i*x), converting through sf at each interface.
r_qf = Q6_Vqf32_vmpy_VsfVsf(
i_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(i_sf, v_sf)))));
// NR step 2: r = r * (2 - r*x), keeping r in qf32 precision.
r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32(
r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf))));
// NR step 3: final refinement.
r_qf = Q6_Vqf32_vmpy_Vqf32Vqf32(
r_qf, Q6_Vqf32_vsub_VsfVsf(two_sf, Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(r_qf), v_sf))));
return Q6_Vsf_equals_Vqf32(r_qf);
}
// Per-lane fp32 reciprocal with Inf/NaN lanes flushed to zero.
// `nan_inf_mask` must be the fp32 exponent mask (0x7f800000): a result
// whose exponent bits are all set is Inf or NaN and gets replaced by 0.
static inline HVX_Vector hvx_vec_inverse_f32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) {
    HVX_Vector inv = hvx_vec_inverse_f32(v_sf);
    const HVX_VectorPred is_special =
        Q6_Q_vcmp_eq_VwVw(nan_inf_mask, Q6_V_vand_VV(inv, nan_inf_mask));
    return Q6_V_vmux_QVV(is_special, Q6_V_vzero(), inv);
}
// Loop body shared by all fp32 inverse variants. Expects `dst`, `src`
// (byte pointers) and `n` (element count) in the expanding scope, plus
// VLEN_FP32 / SIZEOF_FP32 from hvx-base.h. Processes n/VLEN_FP32 full
// vectors, then issues one partial store of the leftover elements.
// NOTE(review): the tail path still reads a full vector from vsrc[i] even
// when fewer than VLEN_FP32 elements remain — assumes over-reads past the
// source buffer are safe on this target; confirm caller padding.
#define hvx_inverse_f32_loop_body(dst_type, src_type, vec_store) \
do { \
dst_type * restrict vdst = (dst_type *) dst; \
src_type * restrict vsrc = (src_type *) src; \
\
const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(0x7f800000); /* fp32 exponent mask */ \
\
const uint32_t nvec = n / VLEN_FP32; /* full vectors */ \
const uint32_t nloe = n % VLEN_FP32; /* leftover elements */ \
\
uint32_t i = 0; \
\
_Pragma("unroll(4)") \
for (; i < nvec; i++) { \
vdst[i] = hvx_vec_inverse_f32_guard(vsrc[i], nan_inf_mask); \
} \
if (nloe) { \
HVX_Vector v = hvx_vec_inverse_f32_guard(vsrc[i], nan_inf_mask); \
vec_store((void *) &vdst[i], nloe * SIZEOF_FP32, v); /* byte-masked tail store */ \
} \
} while(0)
// Per-lane fp16 reciprocal with Inf/NaN lanes flushed to zero.
// `nan_inf_mask` must be the fp16 exponent mask (0x7c00 per halfword): a
// result whose exponent bits are all set is Inf or NaN and becomes 0.
static inline HVX_Vector hvx_vec_inverse_f16_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) {
    HVX_Vector inv = hvx_vec_inverse_f16(v_sf);
    const HVX_VectorPred is_special =
        Q6_Q_vcmp_eq_VhVh(nan_inf_mask, Q6_V_vand_VV(inv, nan_inf_mask));
    return Q6_V_vmux_QVV(is_special, Q6_V_vzero(), inv);
}
// Loop body shared by all fp16 inverse variants. Expects `dst`, `src`
// (byte pointers) and `n` (element count) in the expanding scope, plus
// VLEN_FP16 / SIZEOF_FP16 from hvx-base.h. Processes n/VLEN_FP16 full
// vectors, then issues one partial store of the leftover elements.
// NOTE(review): tail path over-reads vsrc[i] like the fp32 variant —
// assumed safe on this target; confirm caller padding.
#define hvx_inverse_f16_loop_body(dst_type, src_type, vec_store) \
do { \
dst_type * restrict vdst = (dst_type *) dst; \
src_type * restrict vsrc = (src_type *) src; \
\
const HVX_Vector nan_inf_mask = Q6_Vh_vsplat_R(0x7c00); /* fp16 exponent mask */ \
\
const uint32_t nvec = n / VLEN_FP16; /* full vectors */ \
const uint32_t nloe = n % VLEN_FP16; /* leftover elements */ \
\
uint32_t i = 0; \
\
_Pragma("unroll(4)") \
for (; i < nvec; i++) { \
vdst[i] = hvx_vec_inverse_f16_guard(vsrc[i], nan_inf_mask); \
} \
if (nloe) { \
HVX_Vector v = hvx_vec_inverse_f16_guard(vsrc[i], nan_inf_mask); \
vec_store((void *) &vdst[i], nloe * SIZEOF_FP16, v); /* byte-masked tail store */ \
} \
} while(0)
// Defines the four alignment-specialized variants of an inverse op:
//   _aa: aligned dst / aligned src     _au: aligned dst / unaligned src
//   _ua: unaligned dst / aligned src   _uu: unaligned dst / unaligned src
// Each takes (dst, src, n) with n in elements; OP_LOOP_BODY handles the
// full-vector loop and the masked tail store.
//
// BUG FIX: the original definition ended with a trailing '\' after the
// last '}', which spliced the following `#define HVX_INV_DISPATCHER`
// directive into this macro's replacement list. A '#' in a function-like
// macro body not followed by a parameter violates C11 6.10.3.2, so the
// header failed to preprocess. The stray continuation is removed.
#define DEFINE_HVX_INV_OP_VARIANTS(OP_NAME, OP_LOOP_BODY) \
static inline void OP_NAME##_aa(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { \
assert((uintptr_t) dst % 128 == 0); \
assert((uintptr_t) src % 128 == 0); \
OP_LOOP_BODY(HVX_Vector, HVX_Vector, hvx_vec_store_a); \
} \
static inline void OP_NAME##_au(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { \
assert((uintptr_t) dst % 128 == 0); \
OP_LOOP_BODY(HVX_Vector, HVX_UVector, hvx_vec_store_a); \
} \
static inline void OP_NAME##_ua(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { \
assert((uintptr_t) src % 128 == 0); \
OP_LOOP_BODY(HVX_UVector, HVX_Vector, hvx_vec_store_u); \
} \
static inline void OP_NAME##_uu(uint8_t * restrict dst, const uint8_t * restrict src, uint32_t n) { \
OP_LOOP_BODY(HVX_UVector, HVX_UVector, hvx_vec_store_u); \
}
// Defines the public entry point OP_NAME(dst, src, num_elems): probes the
// 128-byte (HVX vector width) alignment of both pointers at runtime and
// forwards to the matching pre-specialized variant (_aa/_au/_ua/_uu).
#define HVX_INV_DISPATCHER(OP_NAME) \
static inline void OP_NAME(uint8_t * restrict dst, const uint8_t * restrict src, const uint32_t num_elems) { \
if (hex_is_aligned((void *) dst, 128) && hex_is_aligned((void *) src, 128)) { \
OP_NAME##_aa(dst, src, num_elems); \
} else if (hex_is_aligned((void *) dst, 128)) { \
OP_NAME##_au(dst, src, num_elems); \
} else if (hex_is_aligned((void *) src, 128)) { \
OP_NAME##_ua(dst, src, num_elems); \
} else { \
OP_NAME##_uu(dst, src, num_elems); \
} \
}
// Instantiate the alignment-specialized variants and runtime dispatchers
// for the fp32 and fp16 element-wise inverse kernels.
DEFINE_HVX_INV_OP_VARIANTS(hvx_inverse_f32, hvx_inverse_f32_loop_body)
DEFINE_HVX_INV_OP_VARIANTS(hvx_inverse_f16, hvx_inverse_f16_loop_body)
HVX_INV_DISPATCHER(hvx_inverse_f32)
HVX_INV_DISPATCHER(hvx_inverse_f16)
#endif