pub fn rms_norm(output: &mut [f32], input: &[f32], weight: &[f32], eps: f32)
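NEON-accelerated RMSNorm: output[i] = (input[i] / rms(input)) * weight[i], where rms(input) is the root mean square of input, stabilized by eps to avoid division by zero.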