#ifndef HVX_EXP_H
#define HVX_EXP_H
#include <stdbool.h>
#include <stdint.h>
#include <math.h>
#include "hvx-base.h"
#include "hvx-floor.h"
#define EXP_COEFF_5 (0x39506967)
#define EXP_COEFF_4 (0x3AB743CE)
#define EXP_COEFF_3 (0x3C088908)
#define EXP_COEFF_2 (0x3D2AA9C1)
#define EXP_COEFF_1 (0x3E2AAAAA)
#define EXP_COEFF_0 (0x3F000000)
#define EXP_LOGN2 (0x3F317218)
#define EXP_LOG2E (0x3FB8AA3B)
#define EXP_LOG2E_F 1.44269504f
#define EXP_ONE (0x3f800000)
#define EXP_RANGE_R (0x42B17218)
#define EXP_RANGE_L (0xC2B00000)
static inline HVX_Vector hvx_vec_exp_f32(HVX_Vector in_vec) {
HVX_Vector z_qf32_v;
HVX_Vector x_v;
HVX_Vector x_qf32_v;
HVX_Vector y_v;
HVX_Vector k_v;
HVX_Vector f_v;
HVX_Vector epsilon_v;
HVX_Vector log2e = Q6_V_vsplat_R(EXP_LOG2E);
HVX_Vector logn2 = Q6_V_vsplat_R(EXP_LOGN2);
HVX_Vector E_const;
HVX_Vector zero_v = Q6_V_vzero();
HVX_Vector temp_v = in_vec;
HVX_VectorPred pred_cap_right = Q6_Q_vcmp_gt_VsfVsf(in_vec, Q6_V_vsplat_R(EXP_RANGE_R));
HVX_VectorPred pred_cap_left = Q6_Q_vcmp_gt_VsfVsf(Q6_V_vsplat_R(EXP_RANGE_L), in_vec);
in_vec = Q6_V_vmux_QVV(pred_cap_right, Q6_V_vsplat_R(EXP_RANGE_R), temp_v);
in_vec = Q6_V_vmux_QVV(pred_cap_left, Q6_V_vsplat_R(EXP_RANGE_L), in_vec);
epsilon_v = Q6_Vqf32_vmpy_VsfVsf(log2e, in_vec);
epsilon_v = Q6_Vsf_equals_Vqf32(epsilon_v);
f_v = hvx_vec_floor_f32(epsilon_v);
k_v = hvx_vec_truncate_f32(f_v);
x_qf32_v = Q6_Vqf32_vadd_VsfVsf(in_vec, zero_v);
epsilon_v = Q6_Vqf32_vmpy_VsfVsf(f_v, logn2);
x_qf32_v = Q6_Vqf32_vsub_Vqf32Vqf32(x_qf32_v, epsilon_v);
x_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(x_qf32_v, zero_v);
x_v = Q6_Vsf_equals_Vqf32(x_qf32_v);
z_qf32_v = Q6_Vqf32_vmpy_Vqf32Vqf32(x_qf32_v, x_qf32_v);
z_qf32_v = Q6_Vqf32_vadd_Vqf32Vsf(z_qf32_v, zero_v);
E_const = Q6_V_vsplat_R(EXP_COEFF_5);
y_v = Q6_Vqf32_vmpy_VsfVsf(E_const, x_v);
E_const = Q6_V_vsplat_R(EXP_COEFF_4);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
E_const = Q6_V_vsplat_R(EXP_COEFF_3);
y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
E_const = Q6_V_vsplat_R(EXP_COEFF_2);
y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
E_const = Q6_V_vsplat_R(EXP_COEFF_1);
y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
E_const = Q6_V_vsplat_R(EXP_COEFF_0);
y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, x_qf32_v);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, E_const);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
y_v = Q6_Vqf32_vmpy_Vqf32Vqf32(y_v, z_qf32_v);
y_v = Q6_Vqf32_vadd_Vqf32Vqf32(y_v, x_qf32_v);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, zero_v);
y_v = Q6_Vqf32_vadd_Vqf32Vsf(y_v, Q6_V_vsplat_R(EXP_ONE));
y_v = Q6_Vsf_equals_Vqf32(y_v);
HVX_Vector y_v_exponent = Q6_Vw_vasl_VwR(y_v, 1);
y_v_exponent = Q6_Vuw_vlsr_VuwR(y_v_exponent, IEEE_VSF_MANTLEN + 1);
y_v_exponent = Q6_Vw_vadd_VwVw(k_v, y_v_exponent);
HVX_VectorPred qy_v_negative_exponent = Q6_Q_vcmp_gt_VwVw(zero_v, y_v_exponent);
y_v = Q6_Vw_vaslacc_VwVwR(y_v, k_v, IEEE_VSF_MANTLEN);
y_v = Q6_V_vmux_QVV(qy_v_negative_exponent, zero_v, y_v);
return y_v;
}
static inline HVX_Vector hvx_vec_exp_f32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) {
const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
HVX_Vector out = hvx_vec_exp_f32(in_vec);
return Q6_V_vmux_QVV(pred0, inf, out);
}
static inline void hvx_exp_f32(uint8_t * restrict dst, const uint8_t * restrict src, const int num_elems, bool negate) {
int left_over = num_elems & (VLEN_FP32 - 1);
int num_elems_whole = num_elems - left_over;
int unaligned_addr = 0;
int unaligned_loop = 0;
if ((0 == hex_is_aligned((void *) src, VLEN)) || (0 == hex_is_aligned((void *) dst, VLEN))) {
unaligned_addr = 1;
}
if ((1 == unaligned_addr) && (num_elems_whole != 0)) {
unaligned_loop = 1;
}
HVX_Vector vec_out = Q6_V_vzero();
static const float kInf = INFINITY;
static const float kMaxExp = 88.7228f;
const HVX_Vector max_exp = hvx_vec_splat_f32(kMaxExp);
const HVX_Vector inf = hvx_vec_splat_f32(kInf);
if (0 == unaligned_loop) {
HVX_Vector * p_vec_in1 = (HVX_Vector *) src;
HVX_Vector * p_vec_out = (HVX_Vector *) dst;
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
if (true == negate) {
HVX_Vector neg_vec_in = hvx_vec_neg_f32(*p_vec_in1++);
*p_vec_out++ = hvx_vec_exp_f32_guard(neg_vec_in, max_exp, inf);
} else {
*p_vec_out++ = hvx_vec_exp_f32_guard(*p_vec_in1++, max_exp, inf);
}
}
} else {
#pragma unroll(4)
for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
if (true == negate) {
HVX_Vector neg_vec_in = hvx_vec_neg_f32(in);
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_f32_guard(neg_vec_in, max_exp, inf);
} else {
*(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_f32_guard(in, max_exp, inf);
}
}
}
if (left_over > 0) {
const float * srcf = (float *) src + num_elems_whole;
float * dstf = (float *) dst + num_elems_whole;
HVX_Vector in = *(HVX_UVector *) srcf;
if (true == negate) {
HVX_Vector neg_vec_in = hvx_vec_neg_f32(in);
vec_out = hvx_vec_exp_f32_guard(neg_vec_in, max_exp, inf);
} else {
vec_out = hvx_vec_exp_f32_guard(in, max_exp, inf);
}
hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);
}
}
static inline HVX_Vector hvx_vec_exp2_f16(HVX_Vector x_v) {
const HVX_Vector zero_v = Q6_V_vzero();
const HVX_Vector half_hf_v = Q6_Vh_vsplat_R(0x3800);
const HVX_Vector v_clamp_min = hvx_vec_splat_f16(-24.0f);
x_v = Q6_Vhf_vmax_VhfVhf(v_clamp_min, x_v);
HVX_Vector x_minus_half = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vsub_VhfVhf(x_v, half_hf_v));
HVX_Vector k_v = Q6_Vh_equals_Vhf(x_minus_half); HVX_Vector f_v = Q6_Vhf_equals_Vh(k_v);
HVX_Vector x_qf16 = Q6_Vqf16_vsub_VhfVhf(x_v, f_v);
HVX_Vector y = Q6_Vqf16_vmpy_Vqf16Vqf16(Q6_Vh_vsplat_R(0x5082), x_qf16); y = Q6_Vqf16_vadd_Vqf16Vhf(y, Q6_Vh_vsplat_R(0x157d)); y = Q6_Vqf16_vmpy_Vqf16Vqf16(y, x_qf16);
y = Q6_Vqf16_vadd_Vqf16Vhf(y, Q6_Vh_vsplat_R(0x20ed)); y = Q6_Vqf16_vmpy_Vqf16Vqf16(y, x_qf16);
y = Q6_Vqf16_vadd_Vqf16Vhf(y, Q6_Vh_vsplat_R(0x2b1b)); y = Q6_Vqf16_vmpy_Vqf16Vqf16(y, x_qf16);
y = Q6_Vqf16_vadd_Vqf16Vhf(y, Q6_Vh_vsplat_R(0x33b0)); y = Q6_Vqf16_vmpy_Vqf16Vqf16(y, x_qf16);
y = Q6_Vqf16_vadd_Vqf16Vhf(y, Q6_Vh_vsplat_R(0x398c)); y = Q6_Vqf16_vmpy_Vqf16Vqf16(y, x_qf16); y = Q6_Vqf16_vadd_Vqf16Vhf(y, Q6_Vh_vsplat_R(0x3c00));
y = Q6_Vhf_equals_Vqf16(y);
HVX_Vector y_exp = Q6_Vuh_vlsr_VuhR(Q6_Vh_vasl_VhR(y, 1), 11);
y_exp = Q6_Vh_vadd_VhVh(k_v, y_exp);
HVX_VectorPred q_underflow = Q6_Q_vcmp_gt_VhVh(zero_v, y_exp);
y = Q6_Vh_vaslacc_VhVhR(y, k_v, 10);
return Q6_V_vmux_QVV(q_underflow, zero_v, y);
}
#endif