#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;
/// Number of `i8` elements consumed per vectorized step of `i8xi8_dot_i32`:
/// the length is split into groups of this size, and each group is fetched
/// with one `vld1q_s8` load per input.
const I8_LANES: usize = 16;
/// Computes the dot product of two `i8` buffers with NEON, returning an `i32`.
///
/// The first `len / I8_LANES` groups of `I8_LANES` elements are processed with
/// vector instructions: each group is loaded as one 128-bit register per
/// input, the low and high halves are multiplied into widened `i16` products
/// (`vmull_s8`), and adjacent product pairs are added into four `i32`
/// accumulator lanes (`vpadalq_s16`). The remaining `len % I8_LANES` elements
/// are folded in by a scalar loop.
///
/// Every accumulation step wraps on `i32` overflow. The NEON adds used here do
/// not saturate, and the scalar tail uses `wrapping_add`, so the result is the
/// same in debug and release builds even for inputs long enough to overflow.
///
/// # Safety
///
/// * `a` and `b` must each be valid for reads of `len` consecutive `i8`
///   values.
/// * The executing CPU must support the `neon` target feature.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
pub unsafe fn i8xi8_dot_i32(a: *const i8, b: *const i8, len: usize) -> i32 {
    let chunks = len / I8_LANES;
    let mut acc = vdupq_n_s32(0);

    for i in 0..chunks {
        let offset = i * I8_LANES;
        // SAFETY: `offset + I8_LANES <= chunks * I8_LANES <= len`, and the
        // caller guarantees both pointers are readable for `len` elements.
        let va = vld1q_s8(a.add(offset));
        let vb = vld1q_s8(b.add(offset));

        // i8 * i8 always fits in i16, so the widening multiply is exact.
        let prod_lo = vmull_s8(vget_low_s8(va), vget_low_s8(vb));
        let prod_hi = vmull_s8(vget_high_s8(va), vget_high_s8(vb));

        // Pairwise-add the i16 products and accumulate them as i32 lanes.
        acc = vpadalq_s16(acc, prod_lo);
        acc = vpadalq_s16(acc, prod_hi);
    }

    // Horizontal sum of the four accumulator lanes.
    let mut result = vaddvq_s32(acc);

    // Scalar tail for the elements that do not fill a whole vector.
    let tail_start = chunks * I8_LANES;
    for offset in tail_start..len {
        // SAFETY: `offset < len`, within the range the caller vouches for.
        let prod = i32::from(*a.add(offset)) * i32::from(*b.add(offset));
        // Wrap like the vector path instead of panicking in checked builds.
        result = result.wrapping_add(prod);
    }

    result
}