#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use crate::optimizer::scalar;
/// Adds `a` and `b` element-wise into `out`, processing
/// `min(a.len(), b.len(), out.len())` elements; any extra elements in
/// the longer slices are left untouched.
///
/// On x86_64 the bulk of the work is done 8 lanes at a time with AVX
/// intrinsics; the remainder (and, on other architectures, the entire
/// range) is handled by `scalar::add_impl`.
///
/// NOTE(review): the `_mm256_*` intrinsics require AVX at runtime —
/// presumably the caller dispatches here only after an
/// `is_x86_feature_detected!("avx")` check; confirm at the call site
/// (or add `#[target_feature(enable = "avx")]` and make this `unsafe fn`).
pub fn add_avx_impl(a: &[f32], b: &[f32], out: &mut [f32]) {
    let len = a.len().min(b.len()).min(out.len());

    // Number of elements covered by the vectorized loop. On non-x86_64
    // targets the SIMD loop below is compiled out, so this must be 0
    // there; the original computed `(len / 8) * 8` unconditionally,
    // which left elements 0..main_loop_len unprocessed on other
    // architectures.
    #[cfg(target_arch = "x86_64")]
    let main_loop_len = (len / 8) * 8;
    #[cfg(not(target_arch = "x86_64"))]
    let main_loop_len = 0;

    #[cfg(target_arch = "x86_64")]
    {
        // SAFETY: for every iteration `i + 8 <= main_loop_len <= len`,
        // and `len` is bounded by each slice's length, so all pointer
        // arithmetic and 8-lane loads/stores stay in bounds. The
        // unaligned variants (`loadu`/`storeu`) are used, so no
        // alignment requirement applies.
        unsafe {
            for i in (0..main_loop_len).step_by(8) {
                let va = _mm256_loadu_ps(a.as_ptr().add(i));
                let vb = _mm256_loadu_ps(b.as_ptr().add(i));
                _mm256_storeu_ps(out.as_mut_ptr().add(i), _mm256_add_ps(va, vb));
            }
        }
    }

    // Scalar path: the SIMD remainder on x86_64, the full range elsewhere.
    if main_loop_len < len {
        scalar::add_impl(
            &a[main_loop_len..len],
            &b[main_loop_len..len],
            &mut out[main_loop_len..len],
        );
    }
}