use std::arch::aarch64::*;
/// Multiplies two vectors of packed complex `f32` values: `lhs * rhs`.
///
/// Every 64-bit pair of lanes is handled as one `[re, im]` value, so a
/// `float32x4_t` carries two complex numbers. For `lhs = a + bi` and
/// `rhs = c + di` each output pair is `[a*c - b*d, a*d + b*c]`.
///
/// The final step is a fused multiply-add, so the `c`-scaled products are
/// accumulated onto the already-rounded `d`-scaled products.
#[inline(always)]
pub(crate) fn vcmulq_f32(lhs: float32x4_t, rhs: float32x4_t) -> float32x4_t {
    // SAFETY: only by-value NEON arithmetic/permute intrinsics are used and no
    // pointer is dereferenced. Assumes NEON is available, which is the default
    // for AArch64 targets.
    unsafe {
        let rhs_negated = vnegq_f32(rhs);
        // Odd lanes of `rhs` and `-rhs` interleaved: [d, -d, ...].
        let rhs_im_alternating = vtrn2q_f32(rhs, rhs_negated);
        // Even lanes of `rhs` duplicated: [c, c, ...].
        let rhs_re_broadcast = vtrn1q_f32(rhs, rhs);
        // [a*d, -b*d] with each pair swapped -> [-b*d, a*d].
        let cross_terms = vrev64q_f32(vmulq_f32(rhs_im_alternating, lhs));
        // [-b*d + c*a, a*d + c*b]
        vfmaq_f32(cross_terms, rhs_re_broadcast, lhs)
    }
}
/// Multiplies two single complex `f32` values held in 64-bit vectors: `lhs * rhs`.
///
/// Both operands are laid out as `[re, im]`. For `lhs = a + bi` and
/// `rhs = c + di` the result is `[a*c - b*d, a*d + b*c]`.
///
/// The final step is a fused multiply-add, so the `c`-scaled products are
/// accumulated onto the already-rounded `d`-scaled products.
#[inline(always)]
pub(crate) fn vcmul_f32(lhs: float32x2_t, rhs: float32x2_t) -> float32x2_t {
    // SAFETY: only by-value NEON arithmetic/permute intrinsics are used and no
    // pointer is dereferenced. Assumes NEON is available, which is the default
    // for AArch64 targets.
    unsafe {
        let rhs_negated = vneg_f32(rhs);
        // Odd lanes of `rhs` and `-rhs`: [d, -d].
        let rhs_im_alternating = vtrn2_f32(rhs, rhs_negated);
        // Even lane of `rhs` duplicated: [c, c].
        let rhs_re_broadcast = vtrn1_f32(rhs, rhs);
        // [a*d, -b*d] with the two lanes swapped -> [-b*d, a*d].
        let cross_terms = vrev64_f32(vmul_f32(rhs_im_alternating, lhs));
        // [-b*d + c*a, a*d + c*b]
        vfma_f32(cross_terms, rhs_re_broadcast, lhs)
    }
}
/// Multiplies packed complex `f32` values by the conjugate of `rhs` using the
/// FCMA complex multiply-accumulate instructions: `lhs * conj(rhs)`.
///
/// Each 64-bit pair of lanes is one `[re, im]` value. With `lhs = a + bi` and
/// `rhs = c + di`, the two accumulation steps onto a zero vector yield
/// `[a*c + b*d, b*c - a*d]` per pair (per the Arm FCMLA rotation semantics).
///
/// Only compiled when the crate's `fcma` feature is enabled; the function
/// itself requires the `fcma` target feature.
#[inline]
#[cfg(feature = "fcma")]
#[target_feature(enable = "fcma")]
pub(crate) fn vfcmulq_conj_f32(lhs: float32x4_t, rhs: float32x4_t) -> float32x4_t {
    let zero = vdupq_n_f32(0.);
    // Rotation 0: accumulates [c*a, c*b].
    let re_scaled = vcmlaq_f32(zero, rhs, lhs);
    // Rotation 270: adds [d*b, -d*a] on top.
    vcmlaq_rot270_f32(re_scaled, rhs, lhs)
}
/// Multiplies one complex `f32` value by the conjugate of `rhs` using the FCMA
/// complex multiply-accumulate instructions: `lhs * conj(rhs)`.
///
/// Both operands are 64-bit vectors laid out as `[re, im]`. With
/// `lhs = a + bi` and `rhs = c + di`, the two accumulation steps onto a zero
/// vector yield `[a*c + b*d, b*c - a*d]` (per the Arm FCMLA rotation
/// semantics).
///
/// Only compiled when the crate's `fcma` feature is enabled; the function
/// itself requires the `fcma` target feature.
#[inline]
#[cfg(feature = "fcma")]
#[target_feature(enable = "fcma")]
pub(crate) fn vfcmul_conj_f32(lhs: float32x2_t, rhs: float32x2_t) -> float32x2_t {
    let zero = vdup_n_f32(0.);
    // Rotation 0: accumulates [c*a, c*b].
    let re_scaled = vcmla_f32(zero, rhs, lhs);
    // Rotation 270: adds [d*b, -d*a] on top.
    vcmla_rot270_f32(re_scaled, rhs, lhs)
}
/// Multiplies two complex `f64` values held in 128-bit vectors: `lhs * rhs`.
///
/// Both operands are laid out as `[re, im]`. For `lhs = a + bi` and
/// `rhs = c + di` the result is `[a*c - b*d, b*c + a*d]`.
///
/// The `d`-scaled terms are folded in with a fused multiply-add on top of the
/// already-rounded `c`-scaled products.
#[inline(always)]
pub(crate) fn vcmulq_f64(lhs: float64x2_t, rhs: float64x2_t) -> float64x2_t {
    // SAFETY: only by-value NEON arithmetic/permute intrinsics are used and no
    // pointer is dereferenced. Assumes NEON is available, which is the default
    // for AArch64 targets.
    unsafe {
        let lhs_re = vget_low_f64(lhs);
        let lhs_im = vget_high_f64(lhs);
        // `lhs` multiplied by i: [-b, a].
        let lhs_times_i = vcombine_f64(vneg_f64(lhs_im), lhs_re);
        // `lhs` scaled by the real part of `rhs` (lane 0): [a*c, b*c].
        let scaled_by_re = vmulq_laneq_f64::<0>(lhs, rhs);
        // Add `lhs_times_i` scaled by the imaginary part of `rhs` (lane 1):
        // [a*c - b*d, b*c + a*d].
        vfmaq_laneq_f64::<1>(scaled_by_re, lhs_times_i, rhs)
    }
}
/// Multiplies one complex `f64` value by the conjugate of `rhs` using the FCMA
/// complex multiply-accumulate instructions: `lhs * conj(rhs)`.
///
/// Both operands are 128-bit vectors laid out as `[re, im]` (the name has no
/// `q`, but the `q`-form intrinsics are used). With `lhs = a + bi` and
/// `rhs = c + di`, the two accumulation steps onto a zero vector yield
/// `[a*c + b*d, b*c - a*d]` (per the Arm FCMLA rotation semantics).
///
/// Only compiled when the crate's `fcma` feature is enabled; the function
/// itself requires the `fcma` target feature.
#[inline]
#[cfg(feature = "fcma")]
#[target_feature(enable = "fcma")]
pub(crate) fn vfcmul_conj_f64(lhs: float64x2_t, rhs: float64x2_t) -> float64x2_t {
    let zero = vdupq_n_f64(0.);
    // Rotation 0: accumulates [c*a, c*b].
    let re_scaled = vcmlaq_f64(zero, rhs, lhs);
    // Rotation 270: adds [d*b, -d*a] on top.
    vcmlaq_rot270_f64(re_scaled, rhs, lhs)
}