pub fn relu_simd(input: &[f32], output: &mut [f32])
A SIMD-accelerated ReLU activation that automatically dispatches to an architecture-specific implementation.