pub fn fma_slice_dispatch(a: &[f32], b: &[f32], acc: &mut [f32])
Fused multiply-accumulate over slices: for each index i, computes acc[i] += a[i] * b[i].
acc[i] += a[i] * b[i]