use crate::neon::util::vfcmul_conj_f64;
use crate::spetrum_arith::SpectrumArithmetic;
use num_complex::Complex;
use std::arch::aarch64::*;
#[derive(Copy, Clone, Default, Debug)]
pub(crate) struct FcmaSpectrumF64 {}
impl SpectrumArithmetic<f64> for FcmaSpectrumF64 {
fn mul_by_b_conj_normalize(
&self,
dst: &mut [Complex<f64>],
input: &[Complex<f64>],
other: &[Complex<f64>],
normalize_value: f64,
) {
unsafe { self.mul_by_b_conj_normalize_impl(dst, input, other, normalize_value) }
}
}
impl FcmaSpectrumF64 {
#[target_feature(enable = "fcma")]
fn mul_by_b_conj_normalize_impl(
&self,
dst: &mut [Complex<f64>],
input: &[Complex<f64>],
other: &[Complex<f64>],
normalize_value: f64,
) {
unsafe {
let v_norm_factor = vdupq_n_f64(normalize_value);
for ((dst, input), other) in dst
.chunks_exact_mut(4)
.zip(input.chunks_exact(4))
.zip(other.chunks_exact(4))
{
let vd0 = vld1q_f64(input.as_ptr().cast());
let vd1 = vld1q_f64(input.get_unchecked(1..).as_ptr().cast());
let vd2 = vld1q_f64(input.get_unchecked(2..).as_ptr().cast());
let vd3 = vld1q_f64(input.get_unchecked(3..).as_ptr().cast());
let vk0 = vld1q_f64(other.as_ptr().cast());
let vk1 = vld1q_f64(other.get_unchecked(1..).as_ptr().cast());
let vk2 = vld1q_f64(other.get_unchecked(2..).as_ptr().cast());
let vk3 = vld1q_f64(other.get_unchecked(3..).as_ptr().cast());
let p0 = vmulq_f64(vfcmul_conj_f64(vd0, vk0), v_norm_factor);
let p1 = vmulq_f64(vfcmul_conj_f64(vd1, vk1), v_norm_factor);
let p2 = vmulq_f64(vfcmul_conj_f64(vd2, vk2), v_norm_factor);
let p3 = vmulq_f64(vfcmul_conj_f64(vd3, vk3), v_norm_factor);
vst1q_f64(dst.as_mut_ptr().cast(), p0);
vst1q_f64(dst.get_unchecked_mut(1..).as_mut_ptr().cast(), p1);
vst1q_f64(dst.get_unchecked_mut(2..).as_mut_ptr().cast(), p2);
vst1q_f64(dst.get_unchecked_mut(3..).as_mut_ptr().cast(), p3);
}
let dst_rem = dst.chunks_exact_mut(4).into_remainder();
let other_rem = other.chunks_exact(4).remainder();
let input_rem = input.chunks_exact(4).remainder();
for ((dst, input), other) in dst_rem
.iter_mut()
.zip(input_rem.iter())
.zip(other_rem.iter())
{
let v0 = vld1q_f64(input as *const Complex<f64> as *const f64);
let v1 = vld1q_f64(other as *const Complex<f64> as *const f64);
let p1 = vmulq_f64(vfcmul_conj_f64(v0, v1), v_norm_factor);
vst1q_f64(dst as *mut Complex<f64> as *mut f64, p1);
}
}
}
}