use core::arch::aarch64::{float64x2_t, vaddq_f64, vaddvq_f64, vdupq_n_f64, vfmaq_f64, vld1q_f64};
/// Computes the dot product of `a` and `b` using NEON fused multiply-add.
///
/// Elements are consumed four at a time through two independent two-lane
/// accumulators, followed by at most one extra two-element step and a scalar
/// `mul_add` tail for a final odd element.
///
/// The slices are expected to have the same length; this is checked with a
/// debug assertion. When debug assertions are disabled and the lengths
/// differ, only the first `min(a.len(), b.len())` element pairs contribute,
/// so no load ever goes past the end of the shorter slice.
///
/// # Safety
///
/// This function is compiled with `#[target_feature(enable = "neon")]`; the
/// caller must ensure the executing CPU supports NEON.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn dot(a: &[f64], b: &[f64]) -> f64 {
    debug_assert_eq!(a.len(), b.len(), "neon::dot: length mismatch");
    // `debug_assert_eq!` vanishes in optimized builds, so bound every access by
    // the shorter slice rather than by `a.len()` alone.
    let n = a.len().min(b.len());
    // SAFETY: every vector load below reads two `f64`s starting at an index
    // `k` with `k + 2 <= n`, and every scalar access uses an index `< n`.
    // Since `n <= a.len()` and `n <= b.len()`, all reads stay inside both
    // slices. NEON availability is guaranteed by the caller.
    unsafe {
        let mut acc0: float64x2_t = vdupq_n_f64(0.0);
        let mut acc1: float64x2_t = vdupq_n_f64(0.0);
        let mut i = 0usize;

        // Main loop: two independent accumulators, four elements per pass.
        while i + 4 <= n {
            let a0 = vld1q_f64(a.as_ptr().add(i));
            let b0 = vld1q_f64(b.as_ptr().add(i));
            let a1 = vld1q_f64(a.as_ptr().add(i + 2));
            let b1 = vld1q_f64(b.as_ptr().add(i + 2));
            acc0 = vfmaq_f64(acc0, a0, b0);
            acc1 = vfmaq_f64(acc1, a1, b1);
            i += 4;
        }

        // At most one remaining full two-lane vector.
        if i + 2 <= n {
            let a0 = vld1q_f64(a.as_ptr().add(i));
            let b0 = vld1q_f64(b.as_ptr().add(i));
            acc0 = vfmaq_f64(acc0, a0, b0);
            i += 2;
        }

        // Combine the accumulators lane-wise, then reduce the two lanes.
        let acc = vaddq_f64(acc0, acc1);
        let mut sum = vaddvq_f64(acc);

        // Scalar tail (zero or one element after the steps above).
        while i < n {
            sum = f64::mul_add(*a.get_unchecked(i), *b.get_unchecked(i), sum);
            i += 1;
        }
        sum
    }
}
/// Returns the sum of the squares of the elements of `v`, accumulated with
/// NEON fused multiply-add.
///
/// Full groups of four elements feed two separate two-lane accumulators; a
/// leftover pair (if any) is folded into the first accumulator, and a final
/// single element (if any) is added with a scalar `mul_add`. An empty slice
/// yields `0.0`.
///
/// # Safety
///
/// This function is compiled with `#[target_feature(enable = "neon")]`; the
/// caller must ensure the executing CPU supports NEON.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn sum_of_squares(v: &[f64]) -> f64 {
    let mut quads = v.chunks_exact(4);
    let mut rest: &[f64];

    // SAFETY: each `quad` holds exactly four `f64`s, so the two-element loads
    // at offsets 0 and 2 are in bounds; the leftover load only happens when
    // `rest` still holds at least two elements. NEON availability is
    // guaranteed by the caller.
    let vector_total = unsafe {
        let zero: float64x2_t = vdupq_n_f64(0.0);
        let mut first = zero;
        let mut second = zero;

        for quad in quads.by_ref() {
            let base = quad.as_ptr();
            let lo = vld1q_f64(base);
            let hi = vld1q_f64(base.add(2));
            first = vfmaq_f64(first, lo, lo);
            second = vfmaq_f64(second, hi, hi);
        }

        rest = quads.remainder();
        if rest.len() >= 2 {
            let pair = vld1q_f64(rest.as_ptr());
            first = vfmaq_f64(first, pair, pair);
            rest = &rest[2..];
        }

        // Lane-wise add of the two accumulators, then horizontal reduction.
        vaddvq_f64(vaddq_f64(first, second))
    };

    // Scalar tail: whatever is left after the vector steps.
    rest.iter()
        .fold(vector_total, |total, &x| f64::mul_add(x, x, total))
}