pub fn dot_product_simd(a: &[f32], b: &[f32]) -> f32
Computes the dot product of slices a and b using SIMD acceleration, automatically dispatching to an architecture-specific implementation.