pub fn elementwise_mul(output: &mut [f32], a: &[f32], b: &[f32])
NEON-accelerated element-wise multiplication of `a` and `b`, with the result written to `output`.