pub fn max_pool_2x2_simd( input: &[f32], output: &mut [f32], h: usize, w: usize, c: usize, stride: usize, )
SIMD-accelerated 2x2 max pooling.